PyCUDA vs PyOpenCL ガチ対決!

どう考えてもGPUよりCPUの方が実行速度が速いわけがないので、汚名挽回の意味も含めて、pycudaとpyopenclのガチ対決をしてみた。

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline

def mandelbrot_image(xmin,xmax,ymin,ymax,width=3,height=3,maxiter=80,cmap='hot'):
    dpi = 72
    img_width = dpi * width
    img_height = dpi * height
    x,y,z = mandelbrot_set(xmin,xmax,ymin,ymax,img_width,img_height,maxiter)
    
    fig, ax = plt.subplots(figsize=(width, height),dpi=72)
    ticks = np.arange(0,img_width,3*dpi)
    x_ticks = xmin + (xmax-xmin)*ticks/img_width
    plt.xticks(ticks, x_ticks)
    y_ticks = ymin + (ymax-ymin)*ticks/img_width
    plt.yticks(ticks, y_ticks)
    
    norm = colors.PowerNorm(0.3)
    ax.imshow(z.T,cmap=cmap,origin='lower',norm=norm)
from __future__ import absolute_import
from __future__ import print_function

import pyopencl as cl
%load_ext pyopencl.ipython_ext
ctx = cl.create_some_context(interactive=True)
devices = ctx.get_info(cl.context_info.DEVICES)
print(devices)
Choose platform:
[0] <pyopencl.Platform 'Experimental OpenCL 2.1 CPU Only Platform' at 0x564760060d70>
Choice [0]:y
Set the environment variable PYOPENCL_CTX='y' to avoid being asked again.
[<pyopencl.Device 'Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz' on 'Experimental OpenCL 2.1 CPU Only Platform' at 0x56475ffe97e8>]
def mandelbrot_gpu(q, maxiter):

    global ctx
    
    queue = cl.CommandQueue(ctx)
    
    output = np.empty(q.shape, dtype=np.uint16)

    prg = cl.Program(ctx, """
    #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
    __kernel void mandelbrot(__global float2 *q,
                     __global ushort *output, ushort const maxiter)
    {
        int gid = get_global_id(0);
        float nreal, real = 0;
        float imag = 0;
        output[gid] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            nreal = real*real - imag*imag + q[gid].x;
            imag = 2* real*imag + q[gid].y;
            real = nreal;
            if (real*real + imag*imag > 4.0f){
                 output[gid] = curiter;
                 break;
            }
        }
    }
    """).build()

    mf = cl.mem_flags
    q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
    output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)


    prg.mandelbrot(queue, output.shape, None, q_opencl,
                   output_opencl, np.uint16(maxiter))

    cl.enqueue_copy(queue, output, output_opencl).wait()
    
    return output

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
    r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
    c = r1 + r2[:,None]*1j
    c = np.ravel(c)
    n3 = mandelbrot_gpu(c,maxiter)
    n3 = n3.reshape((width,height))
    return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
9.16 ms ± 270 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
60.6 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

マンデルブロー描画(pyopencl他バージョン)

def mandelbrot_gpu(q, maxiter):

    global ctx
    
    queue = cl.CommandQueue(ctx)
    
    output = np.empty(q.shape, dtype=np.uint16)

    prg = cl.Program(ctx, """
    #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
    __kernel void mandelbrot(__global float2 *q,
                     __global ushort *output, ushort const maxiter)
    {
        int gid = get_global_id(0);
        float real = q[gid].x;
        float imag = q[gid].y;
        output[gid] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            float real2 = real*real, imag2 = imag*imag;
            if (real*real + imag*imag > 4.0f){
                 output[gid] = curiter;
                 return;
            }
            imag = 2* real*imag + q[gid].y;
            real = real2 - imag2 + q[gid].x;
            
        }
    }
    """).build()

    mf = cl.mem_flags
    q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
    output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)


    prg.mandelbrot(queue, output.shape, None, q_opencl,
                   output_opencl, np.uint16(maxiter))

    cl.enqueue_copy(queue, output, output_opencl).wait()
    
    return output

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
    r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
    r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
    c = r1 + r2[:,None]*1j
    c = np.ravel(c)
    n3 = mandelbrot_gpu(c,maxiter)
    n3 = n3.reshape((width,height))
    return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
11.7 ms ± 166 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/pyopencl/cffi_cl.py:1521: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
  "to see more.", CompilerWarning)
136 ms ± 3.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

こっちのバージョンはGPUを使うと高速になり、逆にCPUは遅くなる。

マンデルブロー集合描画(PyCUDA編)

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

complex_gpu = ElementwiseKernel(
    "pycuda::complex<float> *q, int *output, int maxiter",
    """
    {
        float nreal, real = 0;
        float imag = 0;
        output[i] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            nreal = real*real - imag*imag + q[i].real();
            imag = 2* real*imag + q[i].imag();
            real = nreal;
            if (real*real + imag*imag > 4.0f){
                output[i] = curiter;
                break;
                };
        };
    }
    """,
    "complex5",
    preamble="#include <pycuda-complex.hpp>",)

def mandelbrot_gpu(c, maxiter):
    q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
    iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
    complex_gpu(q_gpu, iterations_gpu, maxiter)
    return iterations_gpu.get()

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
        r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
        r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
        c = r1 + r2[:,None]*1j
        c = np.ravel(c)
        n3 = mandelbrot_gpu(c,maxiter)
        n3 = n3.reshape((width,height))
        return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
8.45 ms ± 68.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
17.5 ms ± 592 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

マンデルブロ集合描画(PyCUDA/Version2)

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

complex_gpu = ElementwiseKernel(
    "pycuda::complex<float> *q, int *output, int maxiter",
    """
    {
        float nreal, real = 0;
        float imag = 0;
        output[i] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            float real2 = real*real;
            float imag2 = imag*imag;
            nreal = real2 - imag2 + q[i].real();
            imag = 2* real*imag + q[i].imag();
            real = nreal;
            if (real2 + imag2 > 4.0f){
                output[i] = curiter;
                break;
                };
        };
    }
    """,
    "complex5",
    preamble="#include <pycuda-complex.hpp>",)

def mandelbrot_gpu(c, maxiter):
    q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
    iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
    complex_gpu(q_gpu, iterations_gpu, maxiter)
    return iterations_gpu.get()

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
        r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
        r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
        c = r1 + r2[:,None]*1j
        c = np.ravel(c)
        n3 = mandelbrot_gpu(c,maxiter)
        n3 = n3.reshape((width,height))
        return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
8.41 ms ± 55.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
15.8 ms ± 84.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

マンデルブロー集合描画(PyCUDA/Version3)

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

complex_gpu1 = ElementwiseKernel(
    "pycuda::complex<float> *q, int *output, int maxiter",
    """
    {
        float real = q[i].real();
        float imag = q[i].imag();
        output[i] = 0;
        for(int curiter = 0; curiter < maxiter; curiter++) {
            float real2 = real*real;
            float imag2 = imag*imag;
            if (real2 + imag2 > 4.0f){
                output[i] = curiter;
                break;
            };
            imag = 2* real*imag + q[i].imag();
            real = real2 - imag2 + q[i].real();
         };
    }
    """,
    "complex5",
    preamble="#include <pycuda-complex.hpp>",)

def mandelbrot_gpu(c, maxiter):
    q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
    iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
    complex_gpu(q_gpu, iterations_gpu, maxiter)
    return iterations_gpu.get()

def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
        r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
        r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
        c = r1 + r2[:,None]*1j
        c = np.ravel(c)
        n3 = mandelbrot_gpu(c,maxiter)
        n3 = n3.reshape((width,height))
        return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
8.44 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
15.8 ms ± 54.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

マンデルブロ集合描画対決結果

60.6/15.8
3.8354430379746836

pycuda版はpyopencl版の3.8倍速かった。そして、python版の5443倍という驚異的な数値を叩き出した。結果としてpycudaの学習をすべきということが良く分かった。

参考サイトhttps://github.com/