# ROOTプログラミング：ROOT C++とPybind11とNumba cudaの速度比較

このサイトのチュートリアルがかなり良さげなので、このサイトを参考にさせてもらってROOTを使ったC++プログラミングを学習してみようと思う。

スポンサーリンク

## ROOT C++ その１¶

%matplotlib inline
import numpy,matplotlib.pyplot,time

import ROOT
tmpname = "run_cpp1"
ROOT.gInterpreter.Declare("""
#include<complex>
void %s(int height, int width, int maxiterations, size_t c_ptr, size_t fractal_ptr) {
double* c = (double*)(c_ptr);
int* fractal = (int*)(fractal_ptr);
for (int h = 0;  h < height;  h++) {
for (int w = 0;  w < width;  w++) {
double creal = c[2 * (h + height*w)];
double cimag = c[2 * (h + height*w) + 1];
std::complex<double> ci = std::complex<double>(creal, cimag);
std::complex<double> z = ci;
for (int i = 0;  i < maxiterations;  i++) {
z = z*z + ci;
if (std::abs(z) > 2) {
fractal[h + height*w] = i;
break;
} } } } }""" % tmpname)

def run_cpp(height, width, maxiterations=20):
y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
c = x + y*1j
fractal = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
getattr(ROOT, tmpname)(height, width, maxiterations, c.ctypes.data, fractal.ctypes.data)
return fractal

Welcome to JupyROOT 6.16/00

starttime = time.time()
fractal = run_cpp(3200, 4800)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))

544.9816739807526 ns per pixel

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)

<matplotlib.image.AxesImage at 0x7f8562983630>
スポンサーリンク

## ROOT C++ その２¶

void run_cpp2(int height, int width, int maxiterations, size_t c_ptr, size_t fractal1_ptr) {
double* c = (double*)(c_ptr);
int* fractal1 = (int*)(fractal1_ptr);
for (int h = 0;  h < height;  h++) {
for (int w = 0;  w < width;  w++) {
double creal = c[2 * (h + height*w)];
double cimag = c[2 * (h + height*w) + 1];
std::complex<double> ci = std::complex<double>(creal, cimag);
std::complex<double> z = ci;
for (int i = 0;  i < maxiterations;  i++) {
z = z*z + ci;
if (std::abs(z) > 2) {
fractal1[h + height*w] = i;
break;
}
}
}
}
}

Overwriting run.cpp2

#%matplotlib inline
#import ROOT,numpy,matplotlib.pyplot,time

starttime = time.time()
height, width, maxiterations = 3200, 4800, 20
y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
c = x + y*1j
fractal1 = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
ROOT.run_cpp2(height, width, maxiterations, c.ctypes.data, fractal1.ctypes.data)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))

Welcome to JupyROOT 6.16/00
544.5340182632208 ns per pixel

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal1)

<matplotlib.image.AxesImage at 0x7f5c2f85e4a8>
スポンサーリンク

## pybind11¶

#include<complex>
#include<pybind11/pybind11.h>

void run_cpp(int height, int width, int maxiterations, size_t c_ptr, size_t fractal_ptr) {
double* c = (double*)(c_ptr);
int* fractal = (int*)(fractal_ptr);
for (int h = 0;  h < height;  h++) {
for (int w = 0;  w < width;  w++) {
double creal = c[2 * (h + height*w)];
double cimag = c[2 * (h + height*w) + 1];
std::complex<double> ci = std::complex<double>(creal, cimag);
std::complex<double> z = ci;
for (int i = 0;  i < maxiterations;  i++) {
z = z*z + ci;
if (std::abs(z) > 2) {
fractal[h + height*w] = i;
break;
}
}
}
}
}
PYBIND11_MODULE(tmp, m) {
m.def("run_cpp", &run_cpp, "the inner loop");
}

Overwriting tmp.cpp

!c++ -O3 -ffast-math -Wall -shared -std=c++11 -fPIC python -m pybind11 --includes tmp.cpp -o tmppython3-config --extension-suffix

import tmp
starttime = time.time()
height, width, maxiterations = 3200, 4800, 20
y, x = numpy.ogrid[-1:0:height*1j, -1.5:0:width*1j]
c = x + y*1j
fractal = numpy.full(c.shape, maxiterations, dtype=numpy.int32)
tmp.run_cpp(height, width, maxiterations, c.ctypes.data, fractal.ctypes.data)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (3200 * 4800)))

69.84719075262547 ns per pixel

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)

<matplotlib.image.AxesImage at 0x7f5c4c2de8d0>

こっちの方が約8倍処理が高速だった。

スポンサーリンク

## numba.cuda¶

import numba.cuda
import math

def run_numba(height, width, maxiterations=20):
fractal = numpy.empty((height, width), dtype=numpy.int32)
griddim = (math.ceil(height / 32), math.ceil(width / 32))
blockdim = (32, 32)
inner_loop[griddim, blockdim](height, width, maxiterations, fractal)
return fractal

@numba.cuda.jit
def inner_loop(height, width, maxiterations, fractal):
x, y = numba.cuda.grid(2)                   # 2-dimensional CUDA grid
z = c = -1.5 + y*1.0/(height + 1) + -1j + x*1j*1.5/(width + 1)
fractal[x, y] = maxiterations
for i in range(maxiterations):
z = z**2 + c
if abs(z) > 2:
fractal[x, y] = i
break

import time, numpy
starttime = time.time()
fractal = run_numba(6400, 9600)
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (6400 * 9600)))

7.681195468952258 ns per pixel


こっちの方が約９倍高速だった。

%matplotlib inline
import matplotlib.pyplot

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal)

<matplotlib.image.AxesImage at 0x7fc4ddf71b38>
スポンサーリンク

## numba.cuda + cupy¶

numpyの代わりにcupyを使ってみる。

import cupy as cp

def run_numba(height, width, maxiterations=20):
fractal = cp.empty((height, width), dtype=cp.int32)
griddim = (math.ceil(height / 32), math.ceil(width / 32))
blockdim = (32, 32)
inner_loop[griddim, blockdim](height, width, maxiterations, fractal)
return fractal

@numba.cuda.jit
def inner_loop(height, width, maxiterations, fractal):
x, y = numba.cuda.grid(2)                   # 2-dimensional CUDA grid
z = c = -1.5 + y*1.0/(height + 1) + -1j + x*1j*1.5/(width + 1)
fractal[x, y] = maxiterations
for i in range(maxiterations):
z = z**2 + c
if abs(z) > 2:
fractal[x, y] = i
break

starttime = time.time()
fractal = run_numba(6400, 9600)
fractal2=fractal.get()
print("{0} ns per pixel".format(1e9 * (time.time() - starttime) / (6400 * 9600)))

6.490872086336215 ns per pixel

fig, ax = matplotlib.pyplot.subplots(figsize=(18, 12))
ax.imshow(fractal2)

<matplotlib.image.AxesImage at 0x7fc504ab02b0>

こっちの方が約1.2倍高速だった。

スポンサーリンク
スポンサーリンク

フォローする