numpy(numpy.dot), pycuda, sk-cuda(culinalg.dot)のdot product計算速度比較をする。pycudaはSourceModuleバージョンを使用する。
ベンチマーク用コードを作成
import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
# Initialize scikit-cuda's linear-algebra module (required before culinalg.dot).
culinalg.init()
# Tiled matrix-multiplication CUDA kernel: C = A x B for square float32
# matrices of edge `width`.  Each 16x16 thread block computes one 16x16 tile
# of C, staging the matching tiles of A and B through __shared__ memory and
# synchronizing between the load and the partial-product accumulation.
# NOTE(review): there are no bounds checks and the loop runs width/16 times,
# so the kernel assumes width is an exact multiple of 16 (true for the
# power-of-two sizes benchmarked below).
mod = SourceModule("""
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded
for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
C[(row * width) + col] = result;
}
""")
# First benchmark: numpy.dot (CPU) vs the hand-written PyCUDA kernel vs
# scikit-cuda's culinalg.dot for square float32 matrices of growing width.
results = []  # rows of [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
for mat_width in [2**8, 2**9, 2**10, 2**11, 2**12, 2**13, 2**14]:
    nb_columnsA = mat_width
    nb_linesA = mat_width
    nb_columnsB = mat_width
    nb_linesB = nb_columnsA
    a = np.random.rand(nb_linesA, nb_columnsA).astype(np.float32)
    b = np.random.rand(nb_linesB, nb_columnsB).astype(np.float32)

    # CPU reference.
    start = time.time()
    numpy_dot = np.dot(a, b)
    sec1 = time.time() - start
    print("CPU time", sec1)

    # PyCUDA.  As in the original cell, the timed region spans the kernel
    # lookup, the host->device transfers and the launch.
    # NOTE(review): kernel launches are asynchronous and no synchronize() is
    # issued, so the kernel may still be running when the clock is read; the
    # implicit sync only happens in c_gpu.get() below.  Left unchanged so the
    # surrounding discussion of the measured numbers still applies.
    start = time.time()
    knl = mod.get_function("dot")
    warp_size = 32  # unused; retained from the original notebook cell
    a_gpu = gpuarray.to_gpu(a)  # a is already float32: no np.array(...) copy needed
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    tile = 16  # thread-block edge; must match the kernel's hard-coded 16x16 tiling
    grid_x = nb_columnsB // tile
    grid_y = nb_linesA // tile  # fixed: original divided by the x block size
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),
        block=(tile, tile, 1), grid=(grid_x, grid_y))
    sec2 = time.time() - start
    print("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print(np.allclose(c_gpu.get(), numpy_dot))

    # scikit-cuda: operands are already resident on the device.
    start = time.time()
    ressk = culinalg.dot(a_gpu, b_gpu)
    sec3 = time.time() - start
    ressk = ressk.get()
    print("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec3 / sec2))
    print(np.allclose(numpy_dot, ressk))

    results.append([mat_width, sec1, sec2, sec3])
    print(c_gpu.size, ressk.size, numpy_dot.size)
ベンチマーク結果をグラフ化
import matplotlib.pyplot as plt
import numpy as np

# Plot the three timing series against mat_width on a log x-axis.
# results rows: [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
results = np.array(results)
# rcParams must be set BEFORE the figure is created to affect this plot
# (the original set them after plt.semilogx, which only styles later figures).
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams["font.size"] = "20"
widths = results[:7, 0:1]
timings = results[:7, 1:4]  # fixed: was 1:6, which over-ran the 4 columns
plt.semilogx(widths, timings, 'o-')
legends = ['CPU', 'GPU', 'SK-CUDA']  # one label per plotted column
plt.ylabel('Seconds')
plt.xlabel('Value of mat_width')
plt.legend(legends);
pycudaのtotal timeで比較する
実際には、pycudaの場合、host→deviceのデータ転送時間も含める必要があるらしい。上記の速度比較は、そのデータ転送を含めていないので、ある意味、インチキと言えるかもしれないが、pureなpycudaの速度とも言えるので、必ずしもインチキとも言えない。
import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
# Initialize scikit-cuda's linear-algebra module (required before culinalg.dot).
culinalg.init()
# Same tiled matrix-multiplication kernel as the previous cell: C = A x B for
# square float32 matrices, one 16x16 output tile per thread block, with the
# A and B tiles staged through __shared__ memory.
# NOTE(review): assumes width is an exact multiple of 16 -- no bounds checks.
mod = SourceModule("""
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded
for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
C[(row * width) + col] = result;
}
""")
# Second benchmark: the kernel handle is fetched once outside the loop, and
# the PyCUDA timed region covers the host->device transfers plus the launch
# ("total time" for PyCUDA).
knl = mod.get_function("dot")
warp_size = 32  # unused; retained from the original notebook cell
tile = 16  # thread-block edge; must match the kernel's hard-coded 16x16 tiling
results = []  # rows of [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
for mat_width in [2**8, 2**9, 2**10, 2**11, 2**12, 2**13, 2**14]:
    nb_columnsA = mat_width
    nb_linesA = mat_width
    nb_columnsB = mat_width
    nb_linesB = nb_columnsA
    a = np.random.rand(nb_linesA, nb_columnsA).astype(np.float32)
    b = np.random.rand(nb_linesB, nb_columnsB).astype(np.float32)

    # CPU reference.
    start = time.time()
    numpy_dot = np.dot(a, b)
    sec1 = time.time() - start
    print("CPU time", sec1)

    # PyCUDA: transfers included in the measurement.
    # NOTE(review): the launch is asynchronous and no synchronize() is issued,
    # so the kernel may still be running when the clock is read; the implicit
    # sync happens later in c_gpu.get().  Left unchanged to preserve the
    # notebook's narrative about the measured numbers.
    start = time.time()
    a_gpu = gpuarray.to_gpu(a)  # a is already float32: no np.array(...) copy needed
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    grid_x = nb_columnsB // tile
    grid_y = nb_linesA // tile  # fixed: original divided by the x block size
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),
        block=(tile, tile, 1), grid=(grid_x, grid_y))
    sec2 = time.time() - start
    print("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print(np.allclose(c_gpu.get(), numpy_dot))

    # scikit-cuda: operands are already resident on the device.
    start = time.time()
    ressk = culinalg.dot(a_gpu, b_gpu)
    sec3 = time.time() - start
    ressk = ressk.get()
    print("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec3 / sec2))
    print(np.allclose(numpy_dot, ressk))

    results.append([mat_width, sec1, sec2, sec3])
    print(c_gpu.size, ressk.size, numpy_dot.size)
ホスト→デバイスのデータ転送速度を含めると、CPU(numpy)とpycudaの速度差が劇的に縮まる一方で、skcudaがpycudaよりも高速という結果になった。mat_widthが、16384の場合、skcudaとpycudaの速度差は78.7倍だった。まぁ、確かに速度差が5万倍とか、あまりにも差が有りすぎるのは何かおかしいと思ってはいた。
skcudaもtotal timeで比較する
しかし、よくよく考えると、scikit-cudaの方もホスト→デバイスのデータ転送時間がタイミングデータに反映されていないということに気付いたので、改めて速度比較をやり直した。
import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
# Initialize scikit-cuda's linear-algebra module (required before culinalg.dot).
culinalg.init()
# Same tiled matrix-multiplication kernel again: C = A x B for square float32
# matrices, one 16x16 output tile per thread block, operand tiles staged
# through __shared__ memory with __syncthreads() between load and use.
# NOTE(review): assumes width is an exact multiple of 16 -- no bounds checks.
mod = SourceModule("""
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded
for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
C[(row * width) + col] = result;
}
""")
# Third benchmark: skcuda is now also timed with its own fresh host->device
# transfers, so both GPU paths are compared on equal ("total time") footing.
knl = mod.get_function("dot")
warp_size = 32  # unused; retained from the original notebook cell
tile = 16  # thread-block edge; must match the kernel's hard-coded 16x16 tiling
results = []  # rows of [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
for mat_width in [2**8, 2**9, 2**10, 2**11, 2**12, 2**13, 2**14]:
    nb_columnsA = mat_width
    nb_linesA = mat_width
    nb_columnsB = mat_width
    nb_linesB = nb_columnsA
    a = np.random.rand(nb_linesA, nb_columnsA).astype(np.float32)
    b = np.random.rand(nb_linesB, nb_columnsB).astype(np.float32)

    # CPU reference.
    start = time.time()
    numpy_dot = np.dot(a, b)
    sec1 = time.time() - start
    print("CPU time", sec1)

    # PyCUDA: transfers plus (asynchronous, unsynchronized) launch.
    start = time.time()
    a_gpu = gpuarray.to_gpu(a)  # a is already float32: no np.array(...) copy needed
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    grid_x = nb_columnsB // tile
    grid_y = nb_linesA // tile  # fixed: original divided by the x block size
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),
        block=(tile, tile, 1), grid=(grid_x, grid_y))
    sec2 = time.time() - start
    print("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print(np.allclose(c_gpu.get(), numpy_dot))

    # scikit-cuda with its own transfers.  Distinct device arrays are used so
    # the PyCUDA result buffer is not clobbered (the original reused `c_gpu`).
    start = time.time()
    a_sk_gpu = gpuarray.to_gpu(a)
    b_sk_gpu = gpuarray.to_gpu(b)
    ressk = culinalg.dot(a_sk_gpu, b_sk_gpu)
    sec3 = time.time() - start
    ressk = ressk.get()
    print("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec2 / sec3))
    print(np.allclose(numpy_dot, ressk))

    results.append([mat_width, sec1, sec2, sec3])
結局、pycudaとskcudaの内積計算速度はほぼ同じという結果になった。ということは、過去の速度比較の結果が正確でないということになるので、何ともやるせない。
真の速度比較
真の速度比較には、データの取り出し(.get())もtimingに含めないといけないらしいので、それらも含めた真のdot product速度比較をしてみた。
import pycuda.autoinit
import numpy as np
import pycuda.gpuarray as gpuarray
import time
import cupy
from pycuda.compiler import SourceModule
import skcuda.linalg as culinalg
import skcuda
# Initialize scikit-cuda's linear-algebra module (required before culinalg.dot).
culinalg.init()
# Same tiled matrix-multiplication kernel as the earlier cells: C = A x B for
# square float32 matrices, one 16x16 output tile per thread block, operand
# tiles staged through __shared__ memory.
# NOTE(review): assumes width is an exact multiple of 16 -- no bounds checks.
mod = SourceModule("""
__global__ void dot(float* C, float* A, float* B, int width) {
__shared__ float Ashare[16][16];
__shared__ float Bshare[16][16];
int bx = blockIdx.x, by = blockIdx.y;
int tx = threadIdx.x, ty = threadIdx.y;
int row = by * 16 + ty;
int col = bx * 16 + tx;
float result = 0;
for (int m = 0; m < width / 16; m++) {
//collectively load the A and B tiles into shared memory
Ashare[ty][tx] = A[(row * width) + (m * 16) + tx];
Bshare[ty][tx] = B[(((m * 16) + ty) * width) + col];
__syncthreads(); //wait for all the shared memory to be loaded
for (int k = 0; k < 16; k++) {
result += Ashare[ty][k] * Bshare[k][tx];
}
__syncthreads();
}
C[(row * width) + col] = result;
}
""")
# Fourth ("true") benchmark: the device->host retrieval (.get()) is included
# in both GPU timings, so every path is measured end to end.
knl = mod.get_function("dot")
warp_size = 32  # unused; retained from the original notebook cell
tile = 16  # thread-block edge; must match the kernel's hard-coded 16x16 tiling
results = []  # rows of [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
for mat_width in [2**8, 2**9, 2**10, 2**11, 2**12, 2**13, 2**14]:
    nb_columnsA = mat_width
    nb_linesA = mat_width
    nb_columnsB = mat_width
    nb_linesB = nb_columnsA
    a = np.random.rand(nb_linesA, nb_columnsA).astype(np.float32)
    b = np.random.rand(nb_linesB, nb_columnsB).astype(np.float32)

    # CPU reference.
    start = time.time()
    numpy_dot = np.dot(a, b)
    sec1 = time.time() - start
    print("CPU time", sec1)

    # PyCUDA: transfer in, kernel, result back.  The blocking device->host
    # copy in .get() also ensures the kernel has finished before the clock
    # is read, unlike the earlier cells.
    start = time.time()
    a_gpu = gpuarray.to_gpu(a)  # a is already float32: no np.array(...) copy needed
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
    grid_x = nb_columnsB // tile
    grid_y = nb_linesA // tile  # fixed: original divided by the x block size
    knl(c_gpu, a_gpu, b_gpu, np.int32(mat_width),
        block=(tile, tile, 1), grid=(grid_x, grid_y))
    pycuda_dot = c_gpu.get()
    sec2 = time.time() - start
    print("GPU time", sec2)
    print('Speed difference of numpy and pycuda: {:0.1f}x'.format(sec1 / sec2))
    print(np.allclose(pycuda_dot, numpy_dot))

    # scikit-cuda measured the same way: transfer in, dot, transfer out.
    # Distinct device arrays avoid clobbering the PyCUDA buffers.
    start = time.time()
    a_sk_gpu = gpuarray.to_gpu(a)
    b_sk_gpu = gpuarray.to_gpu(b)
    ressk = culinalg.dot(a_sk_gpu, b_sk_gpu)
    ressk = ressk.get()
    sec3 = time.time() - start
    print("SKC time", sec3)
    print('Speed difference of pycuda and skcuda: {:0.1f}x'.format(sec2 / sec3))
    print(np.allclose(numpy_dot, ressk))

    results.append([mat_width, sec1, sec2, sec3])
import matplotlib.pyplot as plt
import numpy as np

# Plot the timing series of the "true" benchmark against mat_width.
# results rows: [mat_width, numpy_sec, pycuda_sec, skcuda_sec]
results = np.array(results)
# rcParams must be set BEFORE the figure is created to affect this plot
# (the original set them after plt.semilogx, which only styles later figures).
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams["font.size"] = "20"
widths = results[:7, 0:1]
timings = results[:7, 1:4]  # fixed: was 1:6, which over-ran the 4 columns
plt.semilogx(widths, timings, 'o-')
# fixed: no cupy timings were ever collected into `results`, so only three
# labels apply -- the original's extra 'cupy' label had no matching series.
legends = ['CPU', 'GPU', 'SK-CUDA']
plt.ylabel('Seconds')
plt.xlabel('Value of mat_width')
plt.legend(legends);
実際の速度は、numpyとpycudaにそれ程差はなく、numpyとskcudaの速度差が13倍だった。結論としては、内積計算にはscikit-cudaを使うのがベストのようだ。
参考サイト: https://devtalk.nvidia.com/