今日はこのサイトを参考にして、numpy.sin, cumath.sin, ElementwiseKernel, SourceModuleの速度比較をする。numpy.sinというのは、入力値に対する三角関数の正弦を要素毎に返す関数のことらしい。

速度比較用のコードをロード¶

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import pycuda.cumath
from pycuda.elementwise import ElementwiseKernel

blocks = 128
block_size = 1024
start = drv.Event()
end = drv.Event()
        
mod = SourceModule("""
__global__ void gpusin(float *dest, float *a, int n_iter)
{
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
  for(int n = 0; n < n_iter; n++) {
    a[i] = sin(a[i]);
  }
  dest[i] = a[i];
}
""")
gpusin = mod.get_function("gpusin")

kernel = ElementwiseKernel(
   "float *a, int n_iter",
   "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
   "gpusin")

knl = ElementwiseKernel(
   "float *a",
   "a[i] = sin(a[i]);",
   "gpusin")

results = []
for n_iter in [10**2, 10**3, 10**4, 10**5, 10**6]:
        nbr_values = blocks * block_size
        #print ("Using nbr_values ==", nbr_values)
        #Number of iterations for the calculations,
        # 100 is very quick, 2000000 will take a while
        print ("Calculating %d iterations" % (n_iter))        
        # create two timers so we can speed-test each approach
        ######################
        # SourceModele SECTION
        # We write the C code and the indexing and we have lots of control        
        # create an array of 1s
        a = numpy.ones(nbr_values).astype(numpy.float32)
        # create a destination array that will receive the result
        dest = numpy.zeros_like(a)        
        start.record() # start timing
        gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), \
               grid=(blocks,1), block=(block_size,1,1) )
        end.record() # end timing
        # calculate the run length
        end.synchronize()
        sec1 = start.time_till(end)*1e-3
        print ("SourceModule time and first three results:")
        print ("%fs, %s" % (sec1, str(dest[:3])))
        
        #####################
        # Elementwise SECTION
        # use an ElementwiseKernel with sin in a for loop all in C call from Python
        a = numpy.ones(nbr_values).astype(numpy.float32)
        a_gpu = gpuarray.to_gpu(a)
        start.record() # start timing
        kernel(a_gpu, numpy.int(n_iter))
        end.record() # end timing
        # calculate the run length
        end.synchronize()
        sec2 = start.time_till(end)*1e-3
        print ("Elementwise time and first three results:")
        print ("%fs, %s" % (sec2, str(a_gpu.get()[:3])))        
        
        ####################################
        # Elementwise Python looping SECTION
        # as Elementwise but the for loop is in Python, not in C        
        a = numpy.ones(nbr_values).astype(numpy.float32)
        a_gpu = gpuarray.to_gpu(a)
        start.record() # start timing
        for i in range(n_iter):
            knl(a_gpu)
        end.record() # end timing
        # calculate the run length
        end.synchronize()
        sec3 = start.time_till(end)*1e-3
        print ("Elementwise Python looping time and first three results:")
        print ("%fs, %s" % (sec3, str(a_gpu.get()[:3])))        
        
        ##################
        # GPUArray SECTION
        # The result is copied back to main memory on each iteration, this is a bottleneck
        
        a = numpy.ones(nbr_values).astype(numpy.float32)
        a_gpu = gpuarray.to_gpu(a)
        start.record() # start timing
        for i in range(n_iter):
            a_gpu = pycuda.cumath.sin(a_gpu)
        end.record() # end timing
        # calculate the run length
        end.synchronize()
        sec4 = start.time_till(end)*1e-3
        print ("GPUArray time and first three results:")
        print ("%fs, %s" % (sec4, str(a_gpu.get()[:3])))        
        
        #############
        # CPU SECTION
        # use numpy the calculate the result on the CPU for reference
        
        a = numpy.ones(nbr_values).astype(numpy.float32)
        start.record() # start timing
        start.synchronize()
        
        for i in range(n_iter):
            a = numpy.sin(a)        
        end.record() # end timing
        # calculate the run length
        end.synchronize()
        sec5 = start.time_till(end)*1e-3
        print ("CPU time and first three results:")
        print ("%fs, %s" % (sec5, str(a[:3])))
        results.append([n_iter,sec1,sec2,sec3,sec4,sec5])

Calculating 100 iterations
SourceModule time and first three results:
0.001147s, [0.16885252 0.16885252 0.16885252]
Elementwise time and first three results:
0.046318s, [0.16885252 0.16885252 0.16885252]
Elementwise Python looping time and first three results:
0.047763s, [0.16885252 0.16885252 0.16885252]
GPUArray time and first three results:
0.049422s, [0.16885252 0.16885252 0.16885252]
CPU time and first three results:
0.045375s, [0.1688525 0.1688525 0.1688525]
Calculating 1000 iterations
SourceModule time and first three results:
0.003369s, [0.05459302 0.05459302 0.05459302]
Elementwise time and first three results:
0.003031s, [0.05459302 0.05459302 0.05459302]
Elementwise Python looping time and first three results:
0.008498s, [0.05459302 0.05459302 0.05459302]
GPUArray time and first three results:
0.059103s, [0.05459302 0.05459302 0.05459302]
CPU time and first three results:
0.447246s, [0.054593 0.054593 0.054593]
Calculating 10000 iterations
SourceModule time and first three results:
0.030689s, [0.01731363 0.01731363 0.01731363]
Elementwise time and first three results:
0.027305s, [0.01731363 0.01731363 0.01731363]
Elementwise Python looping time and first three results:
0.081763s, [0.01731363 0.01731363 0.01731363]
GPUArray time and first three results:
0.970744s, [0.01731363 0.01731363 0.01731363]
CPU time and first three results:
4.465541s, [0.01731363 0.01731363 0.01731363]
Calculating 100000 iterations
SourceModule time and first three results:
0.328508s, [0.005477 0.005477 0.005477]
Elementwise time and first three results:
0.247968s, [0.005477 0.005477 0.005477]
Elementwise Python looping time and first three results:
0.818744s, [0.005477 0.005477 0.005477]
GPUArray time and first three results:
6.540062s, [0.005477 0.005477 0.005477]
CPU time and first three results:
43.554379s, [0.005477 0.005477 0.005477]
Calculating 1000000 iterations
SourceModule time and first three results:
2.299262s, [0.00173216 0.00173216 0.00173216]
Elementwise time and first three results:
2.060375s, [0.00173216 0.00173216 0.00173216]
Elementwise Python looping time and first three results:
7.982065s, [0.00173216 0.00173216 0.00173216]
GPUArray time and first three results:
35.678055s, [0.00173216 0.00173216 0.00173216]
CPU time and first three results:
436.231031s, [0.00173216 0.00173216 0.00173216]

pycuda圧勝¶

import matplotlib.pyplot as plt
import numpy as np

results = np.array(results)
legends = []
nH = results[:5, 0:1]
rows = results[:5,1:6]
plt.semilogx(nH,rows, 'o-')
legends += ['' + s for s in ['SourceModule','Elementwise',\
   'Elementwise Python looping','GPUArray','CPU']]
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams["font.size"] = "20"
plt.ylabel('Seconds')
plt.xlabel('Value of n_iter')
plt.legend(legends);

ループ数が100万回だと、pycudaのElementwiseとCPUの差が211.7倍に拡大する。ただ、意外だったのは、ElementwiseとElementwise Python loopingの差が3.87倍しかなかったことだ。ElementwiseとGPUArrayの差は17.3倍で、cumath.sinとnumpy.sinの差は12.2倍だった。