# pyopencl vs cython vs numba guvectorize

python, numpy, numba, cython, pyopenclのマンデルブロ集合描画時間の計測をしてみたところ、非常に意外な結果になった。

スポンサーリンク

## マンデルブロ描画(python編)¶

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline

def mandelbrot_image(xmin,xmax,ymin,ymax,width=3,height=3,maxiter=80,cmap='hot'):
dpi = 72
img_width = dpi * width
img_height = dpi * height
x,y,z = mandelbrot_set(xmin,xmax,ymin,ymax,img_width,img_height,maxiter)

fig, ax = plt.subplots(figsize=(width, height),dpi=72)
ticks = np.arange(0,img_width,3*dpi)
x_ticks = xmin + (xmax-xmin)*ticks/img_width
plt.xticks(ticks, x_ticks)
y_ticks = ymin + (ymax-ymin)*ticks/img_width
plt.yticks(ticks, y_ticks)

norm = colors.PowerNorm(0.3)
ax.imshow(z.T,cmap=cmap,origin='lower',norm=norm)

def mandelbrot(z,maxiter):
c = z
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return maxiter

def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
return (r1,r2,[mandelbrot(complex(r, i),maxiter) for r in r1 for i in r2])

%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

3.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1min 26s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

スポンサーリンク

## マンデルブロ集合描画(Numpy編)¶

def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return 0

def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)

%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

9.22 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
3min 10s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

スポンサーリンク

## マンデルブロ集合描画(numba編)¶

from numba import jit

@jit
def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if abs(z) > 2:
return n
z = z*z + c
return 0

@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)

%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

957 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
5.13 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')


### マンデルブロー描画(numba.jit改良版)¶

absを使ってsquare root computation(平方根計算)を除去

@jit
def mandelbrot(c,maxiter):
z = c
for n in range(maxiter):
if z.real * z.real + z.imag * z.imag > 4.0:
return n
z = z*z + c
return 0

@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
return (r1,r2,n3)

%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

242 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')


complexを2つのfloatに分解

@jit
def mandelbrot(creal,cimag,maxiter):
real = creal
imag = cimag
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal
return 0

@jit
def mandelbrot_set4(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width)
r2 = np.linspace(ymin, ymax, height)
n3 = np.empty((width,height))
for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i],r2[j],maxiter)
return (r1,r2,n3)

%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

90.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

スポンサーリンク

## マンデルブロー集合描画(cython編)¶

%load_ext cython

%%cython
import cython
import numpy as np

cdef int mandelbrot(double creal, double cimag, int maxiter):
cdef:
double real2, imag2
double real = creal, imag = cimag
int n

for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal;
return 0

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mandelbrot_set(double xmin, double xmax, double ymin, double ymax, long width, long height, int maxiter):
cdef:
double[:] r1 = np.linspace(xmin, xmax, width)
double[:] r2 = np.linspace(ymin, ymax, height)
long[:,:] n3 = np.empty((width,height), np.int)
int i,j

for i in range(width):
for j in range(height):
n3[i,j] = mandelbrot(r1[i], r2[j], maxiter)

return (r1,r2,n3)

%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

78.3 ms ± 56.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.02 s ± 873 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')


スポンサーリンク

## マンデルブロ描画(numba guvectorize編)¶

import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32

@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
nreal = 0
real = 0
imag = 0
for n in range(maxiter):
nreal = real*real - imag*imag + c.real
imag = 2* real*imag + c.imag
real = nreal;
if real * real + imag * imag > 4.0:
return n
return 0

@guvectorize([(complex64[:], int32[:], int32[:])], '(n),()->(n)',target='parallel')
def mandelbrot_numpy(c, maxit, output):
maxiter = maxit[0]
for i in range(c.shape[0]):
output[i] = mandelbrot(c[i],maxiter)

def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
n3 = mandelbrot_numpy(c,maxiter)
return (r1,r2,n3.T)

%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

17.6 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
336 ms ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')

2.02/.336

6.011904761904762

numba guvectorizeはcythonの6倍も高速だった。

### マンデルブロ描画(numba guvectorize cuda編)¶

import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32

@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
creal = c.real
cimag = c.imag
real = creal
imag = cimag
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
return n
imag = 2* real*imag + cimag
real = real2 - imag2 + creal

return 0

@guvectorize([(complex64[:], int32[:], int32[:])], '(n),(n)->(n)', target='cuda')
def mandelbrot_numpy(c, maxit, output):
maxiter = maxit[0]
for i in range(c.shape[0]):
creal = c[i].real
cimag = c[i].imag
real = creal
imag = cimag
output[i] = 0
for n in range(maxiter):
real2 = real*real
imag2 = imag*imag
if real2 + imag2 > 4.0:
output[i] = n
break
imag = 2* real*imag + cimag
real = real2 - imag2 + creal

def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
n3 = np.empty(c.shape, int)
maxit = np.ones(c.shape, int) * maxiter
n3 = mandelbrot_numpy(c,maxit)
return (r1,r2,n3.T)

%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

24.9 ms ± 493 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
343 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')


スポンサーリンク

## マンデルブロ集合描画(PyOpenCL編)¶

!conda install -y -c conda-forge pyopencl

Solving environment: done

## Package Plan ##

environment location: /root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2

- pyopencl

The following NEW packages will be INSTALLED:

appdirs:         1.4.3-py_1        conda-forge
mako:            1.0.7-py36_0      conda-forge
ocl-icd:         2.2.9-4           conda-forge
pyopencl:        2018.1.1-py36_2   conda-forge
pytools:         2018.4-py_0       conda-forge

The following packages will be UPDATED:

ca-certificates: 2018.03.07-0                  --> 2018.4.16-0      conda-forge
certifi:         2018.4.16-py36_0              --> 2018.4.16-py36_0 conda-forge
conda:           4.5.4-py36_0                  --> 4.5.4-py36_0     conda-forge
openssl:         1.0.2o-h20670df_0             --> 1.0.2o-0         conda-forge

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

from __future__ import absolute_import
from __future__ import print_function

import pyopencl as cl

%reload_ext pyopencl.ipython_ext

ctx = cl.create_some_context(interactive=True)
devices = ctx.get_info(cl.context_info.DEVICES)
print(devices)

Choose platform:
[0] <pyopencl.Platform 'Experimental OpenCL 2.1 CPU Only Platform' at 0x561ba22a0b10>
Choice [0]:y
Set the environment variable PYOPENCL_CTX='y' to avoid being asked again.
[<pyopencl.Device 'Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz' on 'Experimental OpenCL 2.1 CPU Only Platform' at 0x561ba25b5208>]

def mandelbrot_gpu(q, maxiter):

global ctx

queue = cl.CommandQueue(ctx)

output = np.empty(q.shape, dtype=np.uint16)

prg = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void mandelbrot(__global float2 *q,
__global ushort *output, ushort const maxiter)
{
int gid = get_global_id(0);
float nreal, real = 0;
float imag = 0;
output[gid] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[gid].x;
imag = 2* real*imag + q[gid].y;
real = nreal;
if (real*real + imag*imag > 4.0f){
output[gid] = curiter;
break;
}
}
}
""").build()

mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)

prg.mandelbrot(queue, output.shape, None, q_opencl,
output_opencl, np.uint16(maxiter))

cl.enqueue_copy(queue, output, output_opencl).wait()

return output

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)

mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')

/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
"to see more.", CompilerWarning)

mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
"to see more.", CompilerWarning)

%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)

/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
"to see more.", CompilerWarning)

8.88 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
"to see more.", CompilerWarning)

58.1 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

.336/.0581

5.783132530120483

pyopenclは、cythonの6倍高速だったnumba guvectorizeの5.8倍の速度だったので、つまり、cythonの約35倍という驚異的な結果になった。python版と比較した場合、1480倍というとんでもない数字になった。

スポンサーリンク

## マンデルブロ集合描画(pycuda編)¶

pycuda版のコードにbugがあるのかどうかは分からないが、マンデルブロ集合を描画することができなかったのでボツにした。bug fixを何度か試みたが、私の手に負えるわけもなく、敢え無く断念した。ただ引用元にも書いてあるように、pyopencl(cpu)版が最速なので非常に興味深い結果になった。

スポンサーリンク
スポンサーリンク

フォローする