# Cython vs Numba.cuda.jit vs C wrapper

numba.jit.cudaを使ったマンデルブロー集合描画とcython、ctypesによるCライブラリ実装バージョンとの描画速度の比較をしてみた。

スポンサーリンク

## マンデルブロ集合描画(Cython編)¶

import math
import numpy as np
from numba import cuda
import matplotlib.pyplot as plt
%matplotlib inline

%load_ext cython

%%cython -a
import numpy as np

size = 400
iterations = 100

def mandelbrot_cython(m, size, iterations):
for i in range(size):
for j in range(size):
c = -2 + 3./size*j + 1j*(1.5-3./size*i)
z = 0
for n in range(iterations):
if np.abs(z) <= 10:
z = z*z + c
m[i, j] = n
else:
break

s = (size, size)

%%timeit -n1 -r1 m = np.zeros(s, dtype=np.int32)
mandelbrot_cython(m, size, iterations)

4.27 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

%%cython -a
import numpy as np

def mandelbrot_cython(int[:,::1] m,
int size,
int iterations):
cdef int i, j, n
cdef complex z, c
for i in range(size):
for j in range(size):
c = -2 + 3./size*j + 1j*(1.5-3./size*i)
z = 0
for n in range(iterations):
if z.real**2 + z.imag**2 <= 100:
z = z*z + c
m[i, j] = n
else:
break

%%timeit -n10 -r100 m = np.zeros(s, dtype=np.int32)
mandelbrot_cython(m, size, iterations)

11.7 ms ± 163 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(m), cmap=plt.cm.hot)
ax.set_axis_off()

スポンサーリンク

## マンデルブロー集合描画(cuda.jit編)¶

len(cuda.gpus)

1
cuda.gpus[0].name

b'GeForce GTX 1060 with Max-Q Design'
@cuda.jit
def mandelbrot_numba(m, iterations):
# Matrix index.
i, j = cuda.grid(2)
size = m.shape[0]
# Skip threads outside the matrix.
if i >= size or j >= size:
return
# Run the simulation.
c = (-2 + 3. / size * j +
1j * (1.5 - 3. / size * i))
z = 0
for n in range(iterations):
if abs(z) <= 10:
z = z * z + c
m[i, j] = n
else:
break

size = 400
iterations = 100

m = np.zeros((size, size))

# 16x16 threads per block.
bs = 16
# Number of blocks in the grid.
bpg = math.ceil(size / bs)
# We prepare the GPU function.
f = mandelbrot_numba[(bpg, bpg), (bs, bs)]

%timeit -n10 -r100 f(m, iterations)

2.46 ms ± 122 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)

11.7/2.46

4.7560975609756095

numba.cuda.jitはcythonの僅か4.76倍という非常に残念な結果となった。まぁ、最もGPUがしょぼいだけで、TITAN Vとかなら結果はまた違うと思われる。

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(m), cmap=plt.cm.hot)
ax.set_axis_off()


Numpy arrayのCPUからGPUへのデータ転送速度を計測

%timeit -n10 -r100 cuda.to_device(m)

The slowest run took 5.92 times longer than the fastest. This could mean that an intermediate result is being cached.
421 µs ± 135 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)


%%timeit -n10 -r100 m_gpu = cuda.to_device(m)
f(m_gpu, iterations)

The slowest run took 138.56 times longer than the fastest. This could mean that an intermediate result is being cached.
205 µs ± 1.13 ms per loop (mean ± std. dev. of 100 runs, 10 loops each)


m_gpu = cuda.to_device(m)
%timeit -n10 -r100 m_gpu.copy_to_host()

218 µs ± 50.8 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)


GPUがデータ処理に費やす時間は0.3〜0.5msなのに対し、データ転送にその数倍の時間を費やしていることが分かった。

スポンサーリンク

## マンデルブロ描画(ctypes編)¶

%%writefile mandelbrot.c
#include "stdio.h"
#include "stdlib.h"

void mandelbrot(int size, int iterations, int *col)
{
// Variable declarations.
int i, j, n, index;
double cx, cy;
double z0, z1, z0_tmp, z0_2, z1_2;

// Loop within the grid.
for (i = 0; i < size; i++)
{
cy = -1.5 + (double)i / size * 3;
for (j = 0; j < size; j++)
{
// We initialize the loop of the system.
cx = -2.0 + (double)j / size * 3;
index = i * size + j;
// Let's run the system.
z0 = 0.0;
z1 = 0.0;
for (n = 0; n < iterations; n++)
{
z0_2 = z0 * z0;
z1_2 = z1 * z1;
if (z0_2 + z1_2 <= 100)
{
// Update the system.
z0_tmp = z0_2 - z1_2 + cx;
z1 = 2 * z0 * z1 + cy;
z0 = z0_tmp;
col[index] = n;
}
else
{
break;
}
}
}
}
}

Overwriting mandelbrot.c

!gcc -shared -Wl,-soname,mandelbrot \
-o mandelbrot.so \
-fPIC mandelbrot.c

import ctypes

lib = ctypes.CDLL('./mandelbrot.so')

mandelbrot = lib.mandelbrot

from numpy.ctypeslib import ndpointer

# Define the types of the output and arguments of
# this function.
mandelbrot.restype = None
mandelbrot.argtypes = [ctypes.c_int,
ctypes.c_int,
ndpointer(ctypes.c_int),
]

import numpy as np
# We initialize an empty array.
size = 400
iterations = 100
col = np.empty((size, size), dtype=np.int32)
# We execute the C function, which will update
# the array.
mandelbrot(size, iterations, col)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

%timeit -n10 -r100 mandelbrot(size, iterations, col)

27 ms ± 207 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(col), cmap=plt.cm.hot)
ax.set_axis_off()


ここまでやってcythonよりも遅い結果となった。

スポンサーリンク
スポンサーリンク

フォローする