Cython vs Numba.cuda.jit vs C wrapper

numba.jit.cudaを使ったマンデルブロー集合描画とcython、ctypesによるCライブラリ実装バージョンとの描画速度の比較をしてみた。

スポンサーリンク

マンデルブロ集合描画(Cython編)

import math
import numpy as np
from numba import cuda
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext cython
%%cython -a
import numpy as np

size = 400
iterations = 100

def mandelbrot_cython(m, size, iterations):
    for i in range(size):
        for j in range(size):
            c = -2 + 3./size*j + 1j*(1.5-3./size*i)
            z = 0
            for n in range(iterations):
                if np.abs(z) <= 10:
                    z = z*z + c
                    m[i, j] = n
                else:
                    break
s = (size, size)
%%timeit -n1 -r1 m = np.zeros(s, dtype=np.int32)
mandelbrot_cython(m, size, iterations)
4.27 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%cython -a
import numpy as np

def mandelbrot_cython(int[:,::1] m,
                      int size,
                      int iterations):
    cdef int i, j, n
    cdef complex z, c
    for i in range(size):
        for j in range(size):
            c = -2 + 3./size*j + 1j*(1.5-3./size*i)
            z = 0
            for n in range(iterations):
                if z.real**2 + z.imag**2 <= 100:
                    z = z*z + c
                    m[i, j] = n
                else:
                    break
%%timeit -n10 -r100 m = np.zeros(s, dtype=np.int32)
mandelbrot_cython(m, size, iterations)
11.7 ms ± 163 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(m), cmap=plt.cm.hot)
ax.set_axis_off()

マンデルブロー集合描画(cuda.jit編)

len(cuda.gpus)
1
cuda.gpus[0].name
b'GeForce GTX 1060 with Max-Q Design'
@cuda.jit
def mandelbrot_numba(m, iterations):
    # Matrix index.
    i, j = cuda.grid(2)
    size = m.shape[0]
    # Skip threads outside the matrix.
    if i >= size or j >= size:
        return
    # Run the simulation.
    c = (-2 + 3. / size * j +
         1j * (1.5 - 3. / size * i))
    z = 0
    for n in range(iterations):
        if abs(z) <= 10:
            z = z * z + c
            m[i, j] = n
        else:
            break
size = 400
iterations = 100

m = np.zeros((size, size))
# 16x16 threads per block.
bs = 16
# Number of blocks in the grid.
bpg = math.ceil(size / bs)
# We prepare the GPU function.
f = mandelbrot_numba[(bpg, bpg), (bs, bs)]
%timeit -n10 -r100 f(m, iterations)
2.46 ms ± 122 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)
11.7/2.46
4.7560975609756095

numba.cuda.jitはcythonの僅か4.76倍という非常に残念な結果となった。まぁ、最もGPUがしょぼいだけで、TITAN Vとかなら結果はまた違うと思われる。

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(m), cmap=plt.cm.hot)
ax.set_axis_off()

Numpy arrayのCPUからGPUへのデータ転送速度を計測

%timeit -n10 -r100 cuda.to_device(m)
The slowest run took 5.92 times longer than the fastest. This could mean that an intermediate result is being cached.
421 µs ± 135 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)

転送されたnumpy array dataをGPUが処理する時間を計測。980mに大きく負けているのが何とも情けない。

%%timeit -n10 -r100 m_gpu = cuda.to_device(m)
f(m_gpu, iterations)
The slowest run took 138.56 times longer than the fastest. This could mean that an intermediate result is being cached.
205 µs ± 1.13 ms per loop (mean ± std. dev. of 100 runs, 10 loops each)

処理されたデータをGPUからCPUに戻すのにかかる時間を計測

m_gpu = cuda.to_device(m)
%timeit -n10 -r100 m_gpu.copy_to_host()
218 µs ± 50.8 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)

GPUがデータ処理に費やす時間は0.3〜0.5msなのに対し、データ転送にその数倍の時間を費やしていることが分かった。

マンデルブロ描画(ctypes編)

%%writefile mandelbrot.c
#include "stdio.h"
#include "stdlib.h"

void mandelbrot(int size, int iterations, int *col)
{
    // Variable declarations.
    int i, j, n, index;
    double cx, cy;
    double z0, z1, z0_tmp, z0_2, z1_2;

    // Loop within the grid.
    for (i = 0; i < size; i++)
    {
        cy = -1.5 + (double)i / size * 3;
        for (j = 0; j < size; j++)
        {
            // We initialize the loop of the system.
            cx = -2.0 + (double)j / size * 3;
            index = i * size + j;
            // Let's run the system.
            z0 = 0.0;
            z1 = 0.0;
            for (n = 0; n < iterations; n++)
            {
                z0_2 = z0 * z0;
                z1_2 = z1 * z1;
                if (z0_2 + z1_2 <= 100)
                {
                    // Update the system.
                    z0_tmp = z0_2 - z1_2 + cx;
                    z1 = 2 * z0 * z1 + cy;
                    z0 = z0_tmp;
                    col[index] = n;
                }
                else
                {
                    break;
                }
            }
        }
    }
}
Overwriting mandelbrot.c
!gcc -shared -Wl,-soname,mandelbrot \
    -o mandelbrot.so \
    -fPIC mandelbrot.c
import ctypes
lib = ctypes.CDLL('./mandelbrot.so')
mandelbrot = lib.mandelbrot
from numpy.ctypeslib import ndpointer
# Define the types of the output and arguments of
# this function.
mandelbrot.restype = None
mandelbrot.argtypes = [ctypes.c_int,
                       ctypes.c_int,
                       ndpointer(ctypes.c_int),
                       ]
import numpy as np
# We initialize an empty array.
size = 400
iterations = 100
col = np.empty((size, size), dtype=np.int32)
# We execute the C function, which will update
# the array.
mandelbrot(size, iterations, col)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%timeit -n10 -r100 mandelbrot(size, iterations, col)
27 ms ± 207 µs per loop (mean ± std. dev. of 100 runs, 10 loops each)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(col), cmap=plt.cm.hot)
ax.set_axis_off()

ここまでやってcythonよりも遅い結果となった。

参考サイトhttps://github.com/