numpy, pycuda, sk-cudaでdot product(内積)速度比較

numpy(numpy.dot), pycuda, sk-cuda(culinalg.dot)のdot product計算速度比較をする。pycudaはSourceModuleバージョンを使用する。

スポンサーリンク

ベンチマーク用コードを作成

import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
culinalg.init()

mod = SourceModule("""    
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded

for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
    C[(row * width) + col] = result;
}
""")
results = []
for mat_width in [2**8,2**9,2**10,2**11,2**12,2**13,2**14]:
#mat_width = 256
    nb_columnsA=mat_width
    nb_linesA=mat_width
    nb_columnsB=mat_width
    nb_linesB=nb_columnsA
    a=np.random.rand(nb_linesA,nb_columnsA).astype(np.float32)
    b=np.random.rand(nb_linesB,nb_columnsB).astype(np.float32)
    
    start = time.time()
    numpy_dot = np.dot(a,b)
    end = time.time()
    sec1 = end - start
    print ("CPU time", sec1)
    #print ("numpy_dot", numpy_dot)

    start = time.time()
    knl = mod.get_function("dot")
    warp_size=32
    a_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    b_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    threadPerBlockx=16
    threadPerBlocky=16
    size_Cx = nb_columnsB
    size_Cy = nb_linesA
    BlockPerGridx = (int) (size_Cx // threadPerBlockx);
    BlockPerGridy = (int) (size_Cy // threadPerBlockx);
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),\
       block = (threadPerBlockx, threadPerBlocky, 1), grid=(BlockPerGridx,BlockPerGridy))
    end = time.time()
    sec2 = end - start
    print ("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print (np.allclose(c_gpu.get(),numpy_dot))
    start = time.time()
    ressk = culinalg.dot(a_gpu, b_gpu)
    end = time.time()
    sec3 = end - start
    ressk = ressk.get()
    print ("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec3 / sec2))
    print (np.allclose(numpy_dot,ressk))
    results.append([mat_width,sec1,sec2,sec3])
CPU time 0.0043294429779052734
GPU time 9.131431579589844e-05
Speed difference of numpy and pycuda: 47.4x
True
SKC time 0.004269838333129883
Speed difference of pycuda and skcuda: 46.8x
True
CPU time 0.000993967056274414
GPU time 9.703636169433594e-05
Speed difference of numpy and pycuda: 10.2x
True
SKC time 0.00018310546875
Speed difference of pycuda and skcuda: 1.9x
True
CPU time 0.009647369384765625
GPU time 9.703636169433594e-05
Speed difference of numpy and pycuda: 99.4x
True
SKC time 0.0004210472106933594
Speed difference of pycuda and skcuda: 4.3x
True
CPU time 0.062194108963012695
GPU time 0.00011658668518066406
Speed difference of numpy and pycuda: 533.5x
True
SKC time 0.0003712177276611328
Speed difference of pycuda and skcuda: 3.2x
True
CPU time 0.45549845695495605
GPU time 8.678436279296875e-05
Speed difference of numpy and pycuda: 5248.6x
True
SKC time 0.00034332275390625
Speed difference of pycuda and skcuda: 4.0x
True
CPU time 6.077332258224487
GPU time 0.00010204315185546875
Speed difference of numpy and pycuda: 59556.5x
True
SKC time 0.004248857498168945
Speed difference of pycuda and skcuda: 41.6x
True
CPU time 49.19446063041687
GPU time 0.0007884502410888672
Speed difference of numpy and pycuda: 62393.9x
True
SKC time 0.013907909393310547
Speed difference of pycuda and skcuda: 17.6x
True
print(c_gpu.size, ressk.size, numpy_dot.size)
268435456 268435456 268435456

ベンチマーク結果をグラフ化

import matplotlib.pyplot as plt
import numpy as np

results = np.array(results)
legends = []
nH = results[:7, 0:1]
rows = results[:7,1:6]
plt.semilogx(nH,rows, 'o-')
legends += ['' + s for s in ['CPU','GPU','SK-CUDA']]
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams["font.size"] = "20"
plt.ylabel('Seconds')
plt.xlabel('Value of mat_width')
plt.legend(legends);

mat_widthが16384の場合、numpyとpycudaの速度差が62393.9倍で、pycudaとskcudaの速度差は17.6倍だった。mat_widthが8192だと、numpyとpycudaの速度差は59556.5倍で、pycudaとskcudaの速度差は41.6倍だった。pycuda速過ぎの結果となっている。mat_widthが、$2^{15}=32768$になるとパソコンが固まる。やはり、メインメモリ64G、GPUメモリ16Gは欲しいところだ。

pycudaのtotal timeで比較する

実際には、pycudaの場合、host→deviceのデータ転送時間も含める必要があるらしい。上記の速度比較は、そのデータ転送を含めていないので、ある意味、インチキと言えるかもしれないが、pureなpycudaの速度とも言えるので、必ずしもインチキとも言えない。

import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
culinalg.init()

mod = SourceModule("""    
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded

for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
    C[(row * width) + col] = result;
}
""")
knl = mod.get_function("dot")
warp_size=32

results = []
for mat_width in [2**8,2**9,2**10,2**11,2**12,2**13,2**14]:
#mat_width = 256
    nb_columnsA=mat_width
    nb_linesA=mat_width
    nb_columnsB=mat_width
    nb_linesB=nb_columnsA
    a=np.random.rand(nb_linesA,nb_columnsA).astype(np.float32)
    b=np.random.rand(nb_linesB,nb_columnsB).astype(np.float32)
    
    start = time.time()
    numpy_dot = np.dot(a,b)
    end = time.time()
    sec1 = end - start
    print ("CPU time", sec1)
    #print ("numpy_dot", numpy_dot)

    start = time.time()
    a_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    b_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    threadPerBlockx=16
    threadPerBlocky=16
    size_Cx = nb_columnsB
    size_Cy = nb_linesA
    BlockPerGridx = (int) (size_Cx // threadPerBlockx);
    BlockPerGridy = (int) (size_Cy // threadPerBlockx);
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),\
       block = (threadPerBlockx, threadPerBlocky, 1), grid=(BlockPerGridx,BlockPerGridy))
    end = time.time()
    sec2 = end - start
    print ("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print (np.allclose(c_gpu.get(),numpy_dot))
    
    start = time.time()
    ressk = culinalg.dot(a_gpu, b_gpu)
    end = time.time()
    sec3 = end - start
    ressk = ressk.get()
    print ("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec3 / sec2))
    print (np.allclose(numpy_dot,ressk))
    results.append([mat_width,sec1,sec2,sec3])
CPU time 0.005606889724731445
GPU time 0.005663394927978516
Speed difference of numpy and pycuda: 1.0x
True
SKC time 0.0043337345123291016
Speed difference of pycuda and skcuda: 0.8x
True
CPU time 0.0016651153564453125
GPU time 0.0015158653259277344
Speed difference of numpy and pycuda: 1.1x
True
SKC time 0.0001614093780517578
Speed difference of pycuda and skcuda: 0.1x
True
CPU time 0.007543087005615234
GPU time 0.0027222633361816406
Speed difference of numpy and pycuda: 2.8x
True
SKC time 0.0004000663757324219
Speed difference of pycuda and skcuda: 0.1x
True
CPU time 0.06894278526306152
GPU time 0.015372991561889648
Speed difference of numpy and pycuda: 4.5x
True
SKC time 0.00031375885009765625
Speed difference of pycuda and skcuda: 0.0x
True
CPU time 0.45606541633605957
GPU time 0.08351349830627441
Speed difference of numpy and pycuda: 5.5x
True
SKC time 0.0003459453582763672
Speed difference of pycuda and skcuda: 0.0x
True
CPU time 6.098830699920654
GPU time 0.30339813232421875
Speed difference of numpy and pycuda: 20.1x
True
SKC time 0.004012346267700195
Speed difference of pycuda and skcuda: 0.0x
True
CPU time 47.58618664741516
GPU time 1.0938682556152344
Speed difference of numpy and pycuda: 43.5x
True
SKC time 0.013898372650146484
Speed difference of pycuda and skcuda: 0.0x
True
print(c_gpu.size, ressk.size, numpy_dot.size)
268435456 268435456 268435456

ホスト→デバイスのデータ転送速度を含めると、CPU(numpy)とpycudaの速度差が劇的に縮まる一方で、skcudaがpycudaよりも高速という結果になった。mat_widthが、16384の場合、skcudaとpycudaの速度差は78.7倍だった。まぁ、確かに速度差が5万倍とか、あまりにも差が有りすぎるのは何かおかしいと思ってはいた。

skcudaもtotal timeで比較する

しかし、よくよく考えると、scikit-cudaの方もホスト→デバイスのデータ転送時間がタイミングデータに反映されていないということに気付いたので、改めて速度比較をやり直した。

import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
culinalg.init()

mod = SourceModule("""    
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded

for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
    C[(row * width) + col] = result;
}
""")
knl = mod.get_function("dot")
warp_size=32

results = []
for mat_width in [2**8,2**9,2**10,2**11,2**12,2**13,2**14]:
#mat_width = 256
    nb_columnsA=mat_width
    nb_linesA=mat_width
    nb_columnsB=mat_width
    nb_linesB=nb_columnsA
    a=np.random.rand(nb_linesA,nb_columnsA).astype(np.float32)
    b=np.random.rand(nb_linesB,nb_columnsB).astype(np.float32)
    
    start = time.time()
    numpy_dot = np.dot(a,b)
    end = time.time()
    sec1 = end - start
    print ("CPU time", sec1)
    #print ("numpy_dot", numpy_dot)

    start = time.time()
    a_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    b_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    threadPerBlockx=16
    threadPerBlocky=16
    size_Cx = nb_columnsB
    size_Cy = nb_linesA
    BlockPerGridx = (int) (size_Cx // threadPerBlockx);
    BlockPerGridy = (int) (size_Cy // threadPerBlockx);
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),\
       block = (threadPerBlockx, threadPerBlocky, 1), grid=(BlockPerGridx,BlockPerGridy))
    end = time.time()
    sec2 = end - start
    print ("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print (np.allclose(c_gpu.get(),numpy_dot))
    
    start = time.time()
    c_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    d_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    ressk = culinalg.dot(c_gpu, d_gpu)
    end = time.time()
    sec3 = end - start
    ressk = ressk.get()
    print ("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec2 / sec3))
    print (np.allclose(numpy_dot,ressk))
    results.append([mat_width,sec1,sec2,sec3])
CPU time 0.005476713180541992
GPU time 0.005797863006591797
Speed difference of numpy and pycuda: 0.9x
True
SKC time 0.00584864616394043
Speed difference of pycuda and skcuda: 1.0x
True
CPU time 0.0016560554504394531
GPU time 0.0013129711151123047
Speed difference of numpy and pycuda: 1.3x
True
SKC time 0.0010714530944824219
Speed difference of pycuda and skcuda: 1.2x
True
CPU time 0.00834798812866211
GPU time 0.0027348995208740234
Speed difference of numpy and pycuda: 3.1x
True
SKC time 0.002827167510986328
Speed difference of pycuda and skcuda: 1.0x
True
CPU time 0.05762672424316406
GPU time 0.013735532760620117
Speed difference of numpy and pycuda: 4.2x
True
SKC time 0.007689952850341797
Speed difference of pycuda and skcuda: 1.8x
True
CPU time 0.4263160228729248
GPU time 0.08592534065246582
Speed difference of numpy and pycuda: 5.0x
True
SKC time 0.07005167007446289
Speed difference of pycuda and skcuda: 1.2x
True
CPU time 5.90757942199707
GPU time 0.3042469024658203
Speed difference of numpy and pycuda: 19.4x
True
SKC time 0.27747559547424316
Speed difference of pycuda and skcuda: 1.1x
True
CPU time 48.462236642837524
GPU time 1.0852441787719727
Speed difference of numpy and pycuda: 44.7x
True
SKC time 1.104407548904419
Speed difference of pycuda and skcuda: 1.0x
True

結局、pycudaとskcudaの内積計算速度はほぼ同じという結果になった。ということは、過去の速度比較の結果が正確でないということになるので、何ともやるせない。

真の速度比較

真の速度比較には、データの取り出し(.get())もtimingに含めないといけないらしいので、それらも含めた真のdot product速度比較をしてみた。

import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
import cupy
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
culinalg.init()

mod = SourceModule("""    
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded

for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
    C[(row * width) + col] = result;
}
""")
knl = mod.get_function("dot")
warp_size=32

results = []
for mat_width in [2**8,2**9,2**10,2**11,2**12,2**13,2**14]:
#mat_width = 256
    nb_columnsA=mat_width
    nb_linesA=mat_width
    nb_columnsB=mat_width
    nb_linesB=nb_columnsA
    a=np.random.rand(nb_linesA,nb_columnsA).astype(np.float32)
    b=np.random.rand(nb_linesB,nb_columnsB).astype(np.float32)
    
    start = time.time()
    numpy_dot = np.dot(a,b)
    end = time.time()
    sec1 = end - start
    print ("CPU time", sec1)
    #print ("numpy_dot", numpy_dot)

    start = time.time()
    a_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    b_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    threadPerBlockx=16
    threadPerBlocky=16
    size_Cx = nb_columnsB
    size_Cy = nb_linesA
    BlockPerGridx = (int) (size_Cx // threadPerBlockx);
    BlockPerGridy = (int) (size_Cy // threadPerBlockx);
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),\
       block = (threadPerBlockx, threadPerBlocky, 1), grid=(BlockPerGridx,BlockPerGridy))
    pycuda_dot = c_gpu.get()
    end = time.time()
    sec2 = end - start
    print ("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print (np.allclose(pycuda_dot,numpy_dot))
    
    start = time.time()
    c_gpu = gpuarray.to_gpu(np.array(a,dtype = np.float32))
    d_gpu = gpuarray.to_gpu(np.array(b,dtype = np.float32))
    ressk = culinalg.dot(c_gpu, d_gpu)
    ressk = ressk.get()
    end = time.time()
    sec3 = end - start    
    print ("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec2 / sec3))
    print (np.allclose(numpy_dot,ressk))
    results.append([mat_width,sec1,sec2,sec3])
CPU time 0.003786802291870117
GPU time 0.00936269760131836
Speed difference of numpy and pycuda: 0.4x
True
SKC time 0.004641532897949219
Speed difference of pycuda and skcuda: 2.0x
True
CPU time 0.0016567707061767578
GPU time 0.0021271705627441406
Speed difference of numpy and pycuda: 0.8x
True
SKC time 0.0014183521270751953
Speed difference of pycuda and skcuda: 1.5x
True
CPU time 0.006645679473876953
GPU time 0.008651018142700195
Speed difference of numpy and pycuda: 0.8x
True
SKC time 0.004537820816040039
Speed difference of pycuda and skcuda: 1.9x
True
CPU time 0.06942224502563477
GPU time 0.05013275146484375
Speed difference of numpy and pycuda: 1.4x
True
SKC time 0.0210115909576416
Speed difference of pycuda and skcuda: 2.4x
True
CPU time 0.541370153427124
GPU time 0.44753575325012207
Speed difference of numpy and pycuda: 1.2x
True
SKC time 0.09866809844970703
Speed difference of pycuda and skcuda: 4.5x
True
CPU time 6.792598009109497
GPU time 3.2153449058532715
Speed difference of numpy and pycuda: 2.1x
True
SKC time 0.6954827308654785
Speed difference of pycuda and skcuda: 4.6x
True
CPU time 54.40831637382507
GPU time 23.86566948890686
Speed difference of numpy and pycuda: 2.3x
True
SKC time 4.154383182525635
Speed difference of pycuda and skcuda: 5.7x
True
import matplotlib.pyplot as plt
import numpy as np

results = np.array(results)
legends = []
nH = results[:7, 0:1]
rows = results[:7,1:6]
plt.semilogx(nH,rows, 'o-')
legends += ['' + s for s in ['CPU','GPU','SK-CUDA','cupy']]
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams["font.size"] = "20"
plt.ylabel('Seconds')
plt.xlabel('Value of mat_width')
plt.legend(legends);

実際の速度は、numpyとpycudaにそれ程差はなく、numpyとskcudaの速度差が13倍だった。結論としては、内積計算にはscikit-cudaを使うのがベストのようだ。


参考サイトhttps://devtalk.nvidia.com/