pyopencl vs cython vs numba guvectorize

python, numpy, numba, cython, pyopenclのマンデルブロ集合描画時間の計測をしてみたところ、非常に意外な結果になった。

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline

def mandelbrot_image(xmin,xmax,ymin,ymax,width=3,height=3,maxiter=80,cmap='hot'):
    dpi = 72
    img_width = dpi * width
    img_height = dpi * height
    x,y,z = mandelbrot_set(xmin,xmax,ymin,ymax,img_width,img_height,maxiter)
    
    fig, ax = plt.subplots(figsize=(width, height),dpi=72)
    ticks = np.arange(0,img_width,3*dpi)
    x_ticks = xmin + (xmax-xmin)*ticks/img_width
    plt.xticks(ticks, x_ticks)
    y_ticks = ymin + (ymax-ymin)*ticks/img_width
    plt.yticks(ticks, y_ticks)
    
    norm = colors.PowerNorm(0.3)
    ax.imshow(z.T,cmap=cmap,origin='lower',norm=norm)
def mandelbrot(z,maxiter):
    c = z
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return maxiter

def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width)
    r2 = np.linspace(ymin, ymax, height)
    return (r1,r2,[mandelbrot(complex(r, i),maxiter) for r in r1 for i in r2])
%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
3.48 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1min 26s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

マンデルブロ集合描画(Numpy編)

def mandelbrot(c,maxiter):
    z = c
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return 0

def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width)
    r2 = np.linspace(ymin, ymax, height)
    n3 = np.empty((width,height))
    for i in range(width):
        for j in range(height):
            n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
    return (r1,r2,n3)
%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
9.22 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
3min 10s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

マンデルブロ集合描画(numba編)

from numba import jit

@jit
def mandelbrot(c,maxiter):
    z = c
    for n in range(maxiter):
        if abs(z) > 2:
            return n
        z = z*z + c
    return 0

@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width)
    r2 = np.linspace(ymin, ymax, height)
    n3 = np.empty((width,height))
    for i in range(width):
        for j in range(height):
            n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
    return (r1,r2,n3)
%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
957 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
5.13 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

マンデルブロー描画(numba.jit改良版)

absを使ってsquare root computation(平方根計算)を除去

@jit
def mandelbrot(c,maxiter):
    z = c
    for n in range(maxiter):
        if z.real * z.real + z.imag * z.imag > 4.0:
            return n
        z = z*z + c
    return 0

@jit
def mandelbrot_set(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width)
    r2 = np.linspace(ymin, ymax, height)
    n3 = np.empty((width,height))
    for i in range(width):
        for j in range(height):
            n3[i,j] = mandelbrot(r1[i] + 1j*r2[j],maxiter)
    return (r1,r2,n3)
%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
242 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')

complexを2つのfloatに分解

@jit
def mandelbrot(creal,cimag,maxiter):
    real = creal
    imag = cimag
    for n in range(maxiter):
        real2 = real*real
        imag2 = imag*imag
        if real2 + imag2 > 4.0:
            return n
        imag = 2* real*imag + cimag
        real = real2 - imag2 + creal       
    return 0

@jit
def mandelbrot_set4(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width)
    r2 = np.linspace(ymin, ymax, height)
    n3 = np.empty((width,height))
    for i in range(width):
        for j in range(height):
            n3[i,j] = mandelbrot(r1[i],r2[j],maxiter)
    return (r1,r2,n3)
%timeit -n1 -r1 mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit -n1 -r1 mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
90.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

マンデルブロー集合描画(cython編)

%load_ext cython
%%cython
import cython
import numpy as np

cdef int mandelbrot(double creal, double cimag, int maxiter):
    cdef:
        double real2, imag2
        double real = creal, imag = cimag
        int n

    for n in range(maxiter):
        real2 = real*real
        imag2 = imag*imag
        if real2 + imag2 > 4.0:
            return n
        imag = 2* real*imag + cimag
        real = real2 - imag2 + creal;
    return 0

@cython.boundscheck(False) 
@cython.wraparound(False)
cpdef mandelbrot_set(double xmin, double xmax, double ymin, double ymax, long width, long height, int maxiter):
    cdef:
        double[:] r1 = np.linspace(xmin, xmax, width)
        double[:] r2 = np.linspace(ymin, ymax, height)
        long[:,:] n3 = np.empty((width,height), np.int)
        int i,j
    
    for i in range(width):
        for j in range(height):
            n3[i,j] = mandelbrot(r1[i], r2[j], maxiter)
     
    return (r1,r2,n3)
%timeit mandelbrot_set(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
78.3 ms ± 56.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.02 s ± 873 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')

今回cythonはnumba.jitよりチョロっと速い結果に終わった。

マンデルブロ描画(numba guvectorize編)

import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32

@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
    nreal = 0
    real = 0
    imag = 0
    for n in range(maxiter):
        nreal = real*real - imag*imag + c.real
        imag = 2* real*imag + c.imag
        real = nreal;
        if real * real + imag * imag > 4.0:
            return n
    return 0

@guvectorize([(complex64[:], int32[:], int32[:])], '(n),()->(n)',target='parallel')
def mandelbrot_numpy(c, maxit, output):
    maxiter = maxit[0]
    for i in range(c.shape[0]):
        output[i] = mandelbrot(c[i],maxiter)
        
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
    r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
    c = r1 + r2[:,None]*1j
    n3 = mandelbrot_numpy(c,maxiter)
    return (r1,r2,n3.T)
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
17.6 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
336 ms ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
2.02/.336
6.011904761904762

numba guvectorizeはcythonの6倍も高速だった。

マンデルブロ描画(numba guvectorize cuda編)

import numpy as np
from numba import jit, vectorize, guvectorize, float64, complex64, int32, float32

@jit(int32(complex64, int32))
def mandelbrot(c,maxiter):
    creal = c.real
    cimag = c.imag
    real = creal
    imag = cimag
    for n in range(maxiter):
        real2 = real*real
        imag2 = imag*imag
        if real2 + imag2 > 4.0:
            return n
        imag = 2* real*imag + cimag
        real = real2 - imag2 + creal
        
    return 0

@guvectorize([(complex64[:], int32[:], int32[:])], '(n),(n)->(n)', target='cuda')
def mandelbrot_numpy(c, maxit, output):
    maxiter = maxit[0]
    for i in range(c.shape[0]):
        creal = c[i].real
        cimag = c[i].imag
        real = creal
        imag = cimag
        output[i] = 0
        for n in range(maxiter):
            real2 = real*real
            imag2 = imag*imag
            if real2 + imag2 > 4.0:
                output[i] = n
                break
            imag = 2* real*imag + cimag
            real = real2 - imag2 + creal            
        
def mandelbrot_set2(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
    r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
    c = r1 + r2[:,None]*1j
    n3 = np.empty(c.shape, int)
    maxit = np.ones(c.shape, int) * maxiter
    n3 = mandelbrot_numpy(c,maxit)
    return (r1,r2,n3.T)
%timeit mandelbrot_set2(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set2(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
24.9 ms ± 493 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
343 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mandelbrot_set = mandelbrot_set2
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')

意外な事に、cpu版よりgpu版が遅い結果となった。

マンデルブロ集合描画(PyOpenCL編)

!conda install -y -c conda-forge pyopencl
Solving environment: done

## Package Plan ##

  environment location: /root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2

  added / updated specs: 
    - pyopencl


The following NEW packages will be INSTALLED:

    appdirs:         1.4.3-py_1        conda-forge
    mako:            1.0.7-py36_0      conda-forge
    ocl-icd:         2.2.9-4           conda-forge
    pyopencl:        2018.1.1-py36_2   conda-forge
    pytools:         2018.4-py_0       conda-forge

The following packages will be UPDATED:

    ca-certificates: 2018.03.07-0                  --> 2018.4.16-0      conda-forge
    certifi:         2018.4.16-py36_0              --> 2018.4.16-py36_0 conda-forge
    conda:           4.5.4-py36_0                  --> 4.5.4-py36_0     conda-forge
    openssl:         1.0.2o-h20670df_0             --> 1.0.2o-0         conda-forge

Preparing transaction: done
Verifying transaction: done
Executing transaction: done
from __future__ import absolute_import
from __future__ import print_function

import pyopencl as cl
%reload_ext pyopencl.ipython_ext
ctx = cl.create_some_context(interactive=True)
devices = ctx.get_info(cl.context_info.DEVICES)
print(devices)
Choose platform:
[0] <pyopencl.Platform 'Experimental OpenCL 2.1 CPU Only Platform' at 0x561ba22a0b10>
Choice [0]:y
Set the environment variable PYOPENCL_CTX='y' to avoid being asked again.
[<pyopencl.Device 'Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz' on 'Experimental OpenCL 2.1 CPU Only Platform' at 0x561ba25b5208>]
def mandelbrot_gpu(q, maxiter):

    global ctx
    
    queue = cl.CommandQueue(ctx)
    
    output = np.empty(q.shape, dtype=np.uint16)

    prg = cl.Program(ctx, """
    #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
    __kernel void mandelbrot(__global float2 *q,
                     __global ushort *output, ushort const maxiter)
    {
        int gid = get_global_id(0);
        float nreal, real = 0;
        float imag = 0;
        output[gid] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            nreal = real*real - imag*imag + q[gid].x;
            imag = 2* real*imag + q[gid].y;
            real = nreal;
            if (real*real + imag*imag > 4.0f){
                 output[gid] = curiter;
                 break;
            }
        }
    }
    """).build()

    mf = cl.mem_flags
    q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
    output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)


    prg.mandelbrot(queue, output.shape, None, q_opencl,
                   output_opencl, np.uint16(maxiter))

    cl.enqueue_copy(queue, output, output_opencl).wait()
    
    return output

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
    r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
    c = r1 + r2[:,None]*1j
    c = np.ravel(c)
    n3 = mandelbrot_gpu(c,maxiter)
    n3 = n3.reshape((width,height))
    return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
8.88 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
58.1 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
.336/.0581
5.783132530120483

pyopenclは、cythonの6倍高速だったnumba guvectorizeの5.8倍の速度だったので、つまり、cythonの約35倍という驚異的な結果になった。python版と比較した場合、1480倍というとんでもない数字になった。

マンデルブロ集合描画(pycuda編)

pycuda版のコードにbugがあるのかどうかは分からないが、マンデルブロ集合を描画することができなかったのでボツにした。bug fixを何度か試みたが、私の手に負えるわけもなく、敢え無く断念した。ただ引用元にも書いてあるように、pyopencl(cpu)版が最速なので非常に興味深い結果になった。

参考サイトhttps://github.com/