scikit-cudaとpycudaでdot product(ドット積)

pycudaで構築したdot productコードとscikit-cudaに実装されているモジュールで2つのアレイのdot product(ドット積)をテストしてみる。

スポンサーリンク

dot productテスト(skcuda編)

skcuda.linalg.dotがnumpy.dotと整合するかどうかを確認する。

>>> # Verify that skcuda.linalg.dot agrees with numpy.dot on the CPU.
>>> # skcuda.linalg requires an explicit init() before any linalg call.
>>> import pycuda.autoinit
>>> import pycuda.gpuarray as gpuarray
>>> import numpy as np
>>> import skcuda.linalg as linalg
>>> import skcuda.misc as misc
>>> linalg.init()
>>> # 2-D case: (4, 2) @ (2, 2) matrix product in float32 on the GPU.
>>> a = np.asarray(np.random.rand(4, 2), np.float32)
>>> b = np.asarray(np.random.rand(2, 2), np.float32)
>>> a_gpu = gpuarray.to_gpu(a)
>>> b_gpu = gpuarray.to_gpu(b)
>>> c_gpu = linalg.dot(a_gpu, b_gpu)
>>> # allclose (not exact equality) because float32 GPU/CPU results can
>>> # differ in the last bits.
>>> np.allclose(np.dot(a, b), c_gpu.get())
True
>>> # 1-D case: inner product of two length-5 vectors; linalg.dot returns
>>> # a host scalar here, so it is compared to np.dot directly.
>>> d = np.asarray(np.random.rand(5), np.float32)
>>> e = np.asarray(np.random.rand(5), np.float32)
>>> d_gpu = gpuarray.to_gpu(d)
>>> e_gpu = gpuarray.to_gpu(e)
>>> f = linalg.dot(d_gpu, e_gpu)
>>> np.allclose(np.dot(d, e), f)
True

skcuda.linalg.dotとnumpy.dotの速度を比較する。

%timeit -n10000 -r10 linalg.dot(a_gpu, b_gpu)
%timeit -n10000 -r10 np.dot(a, b)
46.2 µs ± 1.64 µs per loop (mean ± std. dev. of 10 runs, 10000 loops each)
586 ns ± 13.9 ns per loop (mean ± std. dev. of 10 runs, 10000 loops each)
from time import time

# Wall-clock timing of a single GPU call vs a single CPU call.
# NOTE(review): no explicit GPU synchronization happens before the timer is
# stopped — if the cuBLAS call returns before the device finishes, gpu_time
# understates the real cost; confirm against skcuda's blocking behavior.
st = time()
linalg.dot(a_gpu, b_gpu)
gpu_time = (time() - st)
print ("GPU: ", gpu_time)

st = time()
np.dot(a, b)
cpu_time = (time() - st)
print ("CPU: ", cpu_time)
# NOTE(review): this ratio is gpu_time/cpu_time, so a value > 1 means the GPU
# call was *slower* than the CPU — "slowdown" would be the accurate label.
print ("speedup: ", gpu_time/cpu_time)
GPU:  0.00083160400390625
CPU:  0.00017142295837402344
speedup:  4.851182197496523

dot productテスト(PyCUDA編)

import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray  # was missing: used below for to_gpu/empty
from pycuda.compiler import SourceModule
import numpy

# Square matrices of size MATRIX_SIZE x MATRIX_SIZE. The kernel below runs in
# a single thread block, so MATRIX_SIZE * MATRIX_SIZE must stay within the
# device's max threads per block (1024 on current GPUs, i.e. MATRIX_SIZE <= 32).
MATRIX_SIZE = 10

# Random 0/1 matrices so the products are small integers that are easy to
# check by eye against numpy.dot. (Fixed: the original used the alias
# `np.float32` although this snippet only imports `numpy`.)
a = numpy.random.randint(2, size=(MATRIX_SIZE, MATRIX_SIZE)).astype(numpy.float32)
b = numpy.random.randint(2, size=(MATRIX_SIZE, MATRIX_SIZE)).astype(numpy.float32)

# Transfer the inputs to device memory and allocate space for the result.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), numpy.float32)

mod = SourceModule("""
__global__ void dot(float *a, float *b, float *c)
{
    // 2D Thread ID (assuming that only *one* block will be executed)
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;

    // Each thread loads one row of M and one column of N, 
    //   to produce one element of P.
    for (int k = 0; k < %(MATRIX_SIZE)s; ++k) {
        float Aelement = a[ty * %(MATRIX_SIZE)s + k];
        float Belement = b[k * %(MATRIX_SIZE)s + tx];
        Pvalue += Aelement * Belement;
    }

    // Write the matrix to device memory;
    // each thread writes one element
    c[ty * %(MATRIX_SIZE)s + tx] = Pvalue;
}
""" % {
    'MATRIX_SIZE': MATRIX_SIZE
    })

# NOTE: the kernel is named "dot" but computes a full matrix-matrix product
# (one output element per thread in a single MATRIX_SIZE x MATRIX_SIZE block).
func = mod.get_function("dot")
func(a_gpu, b_gpu, c_gpu,
     block=(MATRIX_SIZE, MATRIX_SIZE, 1))
print(a)
print(b)
print(c_gpu.get())
[[0. 0. 0. 0. 1. 0. 1. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 1. 1. 0. 0.]
 [0. 1. 1. 0. 1. 1. 1. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 1. 1. 1.]
 [0. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
 [0. 0. 1. 0. 1. 0. 0. 0. 1. 1.]
 [1. 1. 1. 0. 1. 0. 1. 0. 0. 1.]]
[[0. 1. 1. 0. 1. 1. 1. 0. 1. 1.]
 [0. 0. 0. 1. 1. 1. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 1. 0. 1. 0.]
 [1. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1. 0. 1. 1. 0. 1.]
 [0. 0. 0. 1. 1. 1. 1. 1. 0. 0.]
 [0. 1. 0. 0. 1. 1. 1. 1. 1. 1.]]
[[1. 0. 1. 1. 1. 1. 1. 2. 1. 1.]
 [1. 1. 0. 1. 2. 1. 2. 1. 0. 1.]
 [1. 2. 1. 1. 3. 2. 3. 0. 1. 3.]
 [1. 3. 2. 0. 2. 2. 3. 2. 4. 3.]
 [1. 1. 0. 2. 3. 1. 3. 2. 1. 4.]
 [1. 2. 2. 2. 3. 2. 4. 2. 3. 3.]
 [1. 1. 1. 3. 4. 3. 4. 3. 1. 3.]
 [1. 3. 0. 2. 4. 3. 5. 2. 2. 3.]
 [1. 2. 1. 1. 2. 2. 3. 2. 2. 1.]
 [1. 3. 2. 1. 3. 3. 4. 2. 4. 4.]]

numpy.dotとの整合性を確認する。

np.dot(a, b)
array([[1., 0., 1., 1., 1., 1., 1., 2., 1., 1.],
       [1., 1., 0., 1., 2., 1., 2., 1., 0., 1.],
       [1., 2., 1., 1., 3., 2., 3., 0., 1., 3.],
       [1., 3., 2., 0., 2., 2., 3., 2., 4., 3.],
       [1., 1., 0., 2., 3., 1., 3., 2., 1., 4.],
       [1., 2., 2., 2., 3., 2., 4., 2., 3., 3.],
       [1., 1., 1., 3., 4., 3., 4., 3., 1., 3.],
       [1., 3., 0., 2., 4., 3., 5., 2., 2., 3.],
       [1., 2., 1., 1., 2., 2., 3., 2., 2., 1.],
       [1., 3., 2., 1., 3., 3., 4., 2., 4., 4.]], dtype=float32)
%timeit func(a_gpu, b_gpu, c_gpu, block=(MATRIX_SIZE, MATRIX_SIZE, 1))
15.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit np.dot(a, b)
637 ns ± 5.16 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

array sizeが小さいとCPUの方が実行速度が速い。

参考サイト: https://scikit-cuda.readthedocs.io/
参考サイト: https://wiki.tiker.net/