ROOTプログラミング:ROOT C++とPybind11とNumba cudaの速度比較

このサイトのチュートリアルがかなり良さげなので、このサイトを参考にさせてもらってROOTを使ったC++プログラミングを学習してみようと思う。

スポンサーリンク

ROOT C++ その1

%matplotlib inline
import numpy,matplotlib.pyplot,time

import ROOT
tmpname = "run_cpp1"
ROOT.gInterpreter.Declare("""
#include<complex>
void %s(int height, int width, int maxiterations, size_t c_ptr, size_t fractal_ptr) {
    double* c = (double*)(c_ptr);
    int* fractal = (int*)(fractal_ptr);
    for (int h = 0;  h < height;  h++) {
        for (int w = 0;  w < width;  w++) {
            double creal = c[2 * (h + height*w)];
            double cimag = c[2 * (h + height*w) + 1];
            std::complex<double> ci = std::complex<double>(creal, cimag);
            std::complex<double> z = ci;
            for (int i = 0;  i < maxiterations;  i++) {
                z = z*z + ci;
                if (std::abs(z) > 2) {
                    fractal[h + height*w] = i;
                    break;
} } } } }""" % tmpname)

def run_cpp(height, width, maxiterations=20):
    y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
    c = x + y*1j
    fractal = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
    getattr(ROOT, tmpname)(height, width, maxiterations, c.ctypes.data, fractal.ctypes.data)
    return fractal
Welcome to JupyROOT 6.16/00
starttime = time.time()
fractal = run_cpp(3200, 4800)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))
544.9816739807526 ns per pixel
fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)
<matplotlib.image.AxesImage at 0x7f8562983630>

ROOT C++ その2

上記の方法はやや複雑なので、下記の方法がより分かりやすいかもしれない。先ず、%%file run.cpp2を使って下のコードをrun.cpp2としてセーブする。

void run_cpp2(int height, int width, int maxiterations, size_t c_ptr, size_t fractal1_ptr) {
    double* c = (double*)(c_ptr);
    int* fractal1 = (int*)(fractal1_ptr);
    for (int h = 0;  h < height;  h++) {
        for (int w = 0;  w < width;  w++) {
            double creal = c[2 * (h + height*w)];
            double cimag = c[2 * (h + height*w) + 1];
            std::complex<double> ci = std::complex<double>(creal, cimag);
            std::complex<double> z = ci;
            for (int i = 0;  i < maxiterations;  i++) {
                z = z*z + ci;
                if (std::abs(z) > 2) {
                    fractal1[h + height*w] = i;
                    break;
                }
            }
        }
    }
}
Overwriting run.cpp2
#%matplotlib inline
#import ROOT,numpy,matplotlib.pyplot,time

ROOT.gROOT.LoadMacro("run.cpp2")
starttime = time.time()
height, width, maxiterations = 3200, 4800, 20
y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
c = x + y*1j
fractal1 = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
ROOT.run_cpp2(height, width, maxiterations, c.ctypes.data, fractal1.ctypes.data)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))
Welcome to JupyROOT 6.16/00
544.5340182632208 ns per pixel
fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal1)
<matplotlib.image.AxesImage at 0x7f5c2f85e4a8>

pybind11

上の2つの方法だと恐ろしく処理が遅いので、下の方法を使用した方がよいようである。先ず、%%writefile tmp.cppで下のコードを保存する。

#include<complex>
#include<pybind11/pybind11.h>

void run_cpp(int height, int width, int maxiterations, size_t c_ptr, size_t fractal_ptr) {
    double* c = (double*)(c_ptr);
    int* fractal = (int*)(fractal_ptr);
    for (int h = 0;  h < height;  h++) {
        for (int w = 0;  w < width;  w++) {
            double creal = c[2 * (h + height*w)];
            double cimag = c[2 * (h + height*w) + 1];
            std::complex<double> ci = std::complex<double>(creal, cimag);
            std::complex<double> z = ci;
            for (int i = 0;  i < maxiterations;  i++) {
                z = z*z + ci;
                if (std::abs(z) > 2) {
                    fractal[h + height*w] = i;
                    break;
                }
            }
        }
    }
}
PYBIND11_MODULE(tmp, m) {
    m.def("run_cpp", &run_cpp, "the inner loop");
}
Overwriting tmp.cpp
!c++ -O3 -ffast-math -Wall -shared -std=c++11 -fPIC `python -m pybind11 --includes` tmp.cpp -o tmp`python3-config --extension-suffix`
import tmp
starttime = time.time()
height, width, maxiterations = 3200, 4800, 20
y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
c = x + y*1j
fractal = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
tmp.run_cpp(height, width, maxiterations, c.ctypes.data, fractal.ctypes.data)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))
69.84719075262547 ns per pixel
fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)
<matplotlib.image.AxesImage at 0x7f5c4c2de8d0>

こっちの方が約8倍処理が高速だった。

numba.cuda

import numba.cuda
import math

def run_numba(height, width, maxiterations=20):
    fractal = numpy.empty((height, width), dtype=numpy.int32)
    griddim = (math.ceil(height / 32), math.ceil(width / 32))
    blockdim = (32, 32)
    inner_loop[griddim, blockdim](height, width, maxiterations, fractal)
    return fractal

@numba.cuda.jit
def inner_loop(height, width, maxiterations, fractal):
    x, y = numba.cuda.grid(2)                   # 2-dimensional CUDA grid
    z = c = -1.5 + y*1.0/(height + 1) + -1j + x*1j*1.5/(width + 1)
    fractal[x, y] = maxiterations
    for i in range(maxiterations):
        z = z**2 + c
        if abs(z) > 2:
            fractal[x, y] = i
            break
import time, numpy
starttime = time.time()
fractal = run_numba(6400, 9600)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (6400 * 9600)))
7.681195468952258 ns per pixel

こっちの方が約9倍高速だった。

%matplotlib inline
import matplotlib.pyplot

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)
<matplotlib.image.AxesImage at 0x7fc4ddf71b38>

numba.cuda + cupy

numpyの代わりにcupyを使ってみる。

import cupy as cp

def run_numba(height, width, maxiterations=20):
    fractal = cp.empty((height, width), dtype=cp.int32)
    griddim = (math.ceil(height / 32), math.ceil(width / 32))
    blockdim = (32, 32)
    inner_loop[griddim, blockdim](height, width, maxiterations, fractal)
    return fractal

@numba.cuda.jit
def inner_loop(height, width, maxiterations, fractal):
    x, y = numba.cuda.grid(2)                   # 2-dimensional CUDA grid
    z = c = -1.5 + y*1.0/(height + 1) + -1j + x*1j*1.5/(width + 1)
    fractal[x, y] = maxiterations
    for i in range(maxiterations):
        z = z**2 + c
        if abs(z) > 2:
            fractal[x, y] = i
            break
starttime = time.time()
fractal = run_numba(6400, 9600)
fractal2=fractal.get()
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (6400 * 9600)))
6.490872086336215 ns per pixel
fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal2)
<matplotlib.image.AxesImage at 0x7fc504ab02b0>

こっちの方が約1.2倍高速だった。