どう考えてもGPUよりCPUの方が実行速度が速いわけがないので、汚名挽回の意味も含めて、pycudaとpyopenclのガチ対決をしてみた。
スポンサーリンク
マンデルブロ集合描画(PyOpenCL編)¶
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline
def mandelbrot_image(xmin,xmax,ymin,ymax,width=3,height=3,maxiter=80,cmap='hot'):
dpi = 72
img_width = dpi * width
img_height = dpi * height
x,y,z = mandelbrot_set(xmin,xmax,ymin,ymax,img_width,img_height,maxiter)
fig, ax = plt.subplots(figsize=(width, height),dpi=72)
ticks = np.arange(0,img_width,3*dpi)
x_ticks = xmin + (xmax-xmin)*ticks/img_width
plt.xticks(ticks, x_ticks)
y_ticks = ymin + (ymax-ymin)*ticks/img_width
plt.yticks(ticks, y_ticks)
norm = colors.PowerNorm(0.3)
ax.imshow(z.T,cmap=cmap,origin='lower',norm=norm)
from __future__ import absolute_import
from __future__ import print_function
import pyopencl as cl
%load_ext pyopencl.ipython_ext
ctx = cl.create_some_context(interactive=True)
devices = ctx.get_info(cl.context_info.DEVICES)
print(devices)
def mandelbrot_gpu(q, maxiter):
global ctx
queue = cl.CommandQueue(ctx)
output = np.empty(q.shape, dtype=np.uint16)
prg = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void mandelbrot(__global float2 *q,
__global ushort *output, ushort const maxiter)
{
int gid = get_global_id(0);
float nreal, real = 0;
float imag = 0;
output[gid] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[gid].x;
imag = 2* real*imag + q[gid].y;
real = nreal;
if (real*real + imag*imag > 4.0f){
output[gid] = curiter;
break;
}
}
}
""").build()
mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg.mandelbrot(queue, output.shape, None, q_opencl,
output_opencl, np.uint16(maxiter))
cl.enqueue_copy(queue, output, output_opencl).wait()
return output
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
マンデルブロー描画(pyopencl他バージョン)¶
def mandelbrot_gpu(q, maxiter):
global ctx
queue = cl.CommandQueue(ctx)
output = np.empty(q.shape, dtype=np.uint16)
prg = cl.Program(ctx, """
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
__kernel void mandelbrot(__global float2 *q,
__global ushort *output, ushort const maxiter)
{
int gid = get_global_id(0);
float real = q[gid].x;
float imag = q[gid].y;
output[gid] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real, imag2 = imag*imag;
if (real*real + imag*imag > 4.0f){
output[gid] = curiter;
return;
}
imag = 2* real*imag + q[gid].y;
real = real2 - imag2 + q[gid].x;
}
}
""").build()
mf = cl.mem_flags
q_opencl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
output_opencl = cl.Buffer(ctx, mf.WRITE_ONLY, output.nbytes)
prg.mandelbrot(queue, output.shape, None, q_opencl,
output_opencl, np.uint16(maxiter))
cl.enqueue_copy(queue, output, output_opencl).wait()
return output
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
スポンサーリンク
マンデルブロー集合描画(PyCUDA編)¶
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
complex_gpu = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float nreal, real = 0;
float imag = 0;
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
nreal = real*real - imag*imag + q[i].real();
imag = 2* real*imag + q[i].imag();
real = nreal;
if (real*real + imag*imag > 4.0f){
output[i] = curiter;
break;
};
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
def mandelbrot_gpu(c, maxiter):
q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
complex_gpu(q_gpu, iterations_gpu, maxiter)
return iterations_gpu.get()
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
マンデルブロ集合描画(PyCUDA/Version2)¶
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
complex_gpu = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float nreal, real = 0;
float imag = 0;
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real;
float imag2 = imag*imag;
nreal = real2 - imag2 + q[i].real();
imag = 2* real*imag + q[i].imag();
real = nreal;
if (real2 + imag2 > 4.0f){
output[i] = curiter;
break;
};
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
def mandelbrot_gpu(c, maxiter):
q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
complex_gpu(q_gpu, iterations_gpu, maxiter)
return iterations_gpu.get()
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
マンデルブロー集合描画(PyCUDA/Version3)¶
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
complex_gpu1 = ElementwiseKernel(
"pycuda::complex<float> *q, int *output, int maxiter",
"""
{
float real = q[i].real();
float imag = q[i].imag();
output[i] = 0;
for(int curiter = 0; curiter < maxiter; curiter++) {
float real2 = real*real;
float imag2 = imag*imag;
if (real2 + imag2 > 4.0f){
output[i] = curiter;
break;
};
imag = 2* real*imag + q[i].imag();
real = real2 - imag2 + q[i].real();
};
}
""",
"complex5",
preamble="#include <pycuda-complex.hpp>",)
def mandelbrot_gpu(c, maxiter):
q_gpu = gpuarray.to_gpu(c.astype(np.complex64))
iterations_gpu = gpuarray.to_gpu(np.empty(c.shape, dtype=np.float32))
complex_gpu(q_gpu, iterations_gpu, maxiter)
return iterations_gpu.get()
def mandelbrot_set3(xmin,xmax,ymin,ymax,width,height,maxiter):
r1 = np.linspace(xmin, xmax, width, dtype=np.float32)
r2 = np.linspace(ymin, ymax, height, dtype=np.float32)
c = r1 + r2[:,None]*1j
c = np.ravel(c)
n3 = mandelbrot_gpu(c,maxiter)
n3 = n3.reshape((width,height))
return (r1,r2,n3.T)
mandelbrot_set = mandelbrot_set3
mandelbrot_image(-2.0,0.5,-1.25,1.25,cmap='gnuplot2')
mandelbrot_image(-0.74877,-0.74872,0.06505,0.06510,maxiter=2048,cmap='gnuplot2')
%timeit mandelbrot_set3(-2.0,0.5,-1.25,1.25,1000,1000,80)
%timeit mandelbrot_set3(-0.74877,-0.74872,0.06505,0.06510,1000,1000,2048)
スポンサーリンク
マンデルブロ集合描画対決結果¶
60.6/15.8
pycuda版はpyopencl版の3.8倍速かった。そして、python版の5443倍という驚異的な数値を叩き出した。結果としてpycudaの学習をすべきということが良く分かった。
スポンサーリンク
スポンサーリンク