I benchmarked np.dot against tf.matmul, and then skcuda.linalg.dot against tf.matmul. NumPy is obviously the slowest of the three, but I was curious whether skcuda or TensorFlow would come out faster.
NumPy vs. TensorFlow
import numpy as np
import tensorflow as tf
import time

# Limit TensorFlow to about 2/3 of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.667)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

dim = 10000
rnd = np.random.RandomState(0)
a = rnd.rand(dim, dim).astype(np.float32)
b = rnd.rand(dim, dim).astype(np.float32)

# CPU: NumPy matrix multiplication.
start = time.time()
rescpu = np.dot(a, b)
c = time.time() - start
print('CPU:', c)

# GPU: TensorFlow matrix multiplication; the timing includes the
# feed_dict host-to-device copy of both matrices.
X = tf.placeholder(tf.float32, shape=(dim, dim))
Y = tf.placeholder(tf.float32, shape=(dim, dim))
Z = tf.matmul(X, Y)
start = time.time()
resgpu = sess.run(Z, feed_dict={X: a, Y: b})
d = time.time() - start
print('GPU:', d)

print('Speed difference: {:0.1f}x'.format(c / d))
print(np.allclose(rescpu, resgpu))
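For reference, the GPU timing above includes the feed_dict host-to-device copy. A minimal sketch, assuming the same TensorFlow 1.x API, that keeps the operands resident on the GPU as tf.Variable objects and adds a warm-up run, so the timed call pays only for the multiplication and the result fetch:

import numpy as np
import tensorflow as tf
import time
dim = 10000
rnd = np.random.RandomState(0)
a = rnd.rand(dim, dim).astype(np.float32)
b = rnd.rand(dim, dim).astype(np.float32)
X = tf.Variable(a)
Y = tf.Variable(b)
Z = tf.matmul(X, Y)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # upload a and b to the GPU once
    sess.run(Z)                                  # warm-up run
    start = time.time()
    resgpu = sess.run(Z)                         # timed: matmul plus result fetch only
    print('GPU (no feed):', time.time() - start)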
scikit-cuda vs. TensorFlow
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import time
import skcuda.linalg as culinalg
import tensorflow as tf

culinalg.init()

# Limit TensorFlow to about 2/3 of the GPU memory so it can share
# the device with PyCUDA.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.667)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

dim = 10000
rnd = np.random.RandomState(0)
a = rnd.rand(dim, dim).astype(np.float32)
b = rnd.rand(dim, dim).astype(np.float32)

# scikit-cuda: the operands are uploaded to the GPU before the clock starts.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
start = time.time()
ressk = culinalg.dot(a_gpu, b_gpu)
c = time.time() - start
print('sk:', c)
ressk = ressk.get()

# TensorFlow: the operands are fed from host memory, so the timing
# includes the host-to-device copy.
X = tf.placeholder(tf.float32, shape=(dim, dim))
Y = tf.placeholder(tf.float32, shape=(dim, dim))
Z = tf.matmul(X, Y)
start = time.time()
restf = sess.run(Z, feed_dict={X: a, Y: b})
d = time.time() - start
print('tf:', d)

print('Speed difference: {:0.1f}x'.format(d / c))
print(np.allclose(ressk, restf))
skcuda turned out to be 25.5 times faster than TensorFlow here, which surprised me.
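One caveat about this number: CUDA kernel launches are asynchronous, and the scikit-cuda clock above is stopped before ressk.get(), so it may be capturing mostly launch overhead rather than the full multiplication, while the TensorFlow figure also pays for the feed_dict transfer. A minimal sketch, assuming the same pycuda/skcuda setup as above, that synchronizes the CUDA context before reading the clock:

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import time
import skcuda.linalg as culinalg
culinalg.init()
dim = 10000
rnd = np.random.RandomState(0)
a_gpu = gpuarray.to_gpu(rnd.rand(dim, dim).astype(np.float32))
b_gpu = gpuarray.to_gpu(rnd.rand(dim, dim).astype(np.float32))
start = time.time()
ressk = culinalg.dot(a_gpu, b_gpu)
pycuda.autoinit.context.synchronize()  # wait for the kernel to actually finish
print('sk (synchronized):', time.time() - start)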
scikit-cuda vs. NumPy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import time
import skcuda.linalg as culinalg

culinalg.init()

dim = 11900
rnd = np.random.RandomState(0)
a = rnd.rand(dim, dim).astype(np.float32)
b = rnd.rand(dim, dim).astype(np.float32)

# Upload the operands to the GPU before timing.
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

# CPU: NumPy matrix multiplication.
start = time.time()
rescpu = np.dot(a, b)
c = time.time() - start
print('CPU:', c)

# GPU: scikit-cuda matrix multiplication.
start = time.time()
resgpu = culinalg.dot(a_gpu, b_gpu)
d = time.time() - start
print('GPU:', d)
resgpu = resgpu.get()

print('Speed difference: {:0.1f}x'.format(c / d))
print(np.allclose(rescpu, resgpu))
Comparing skcuda directly with NumPy, skcuda was 588 times faster.
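A single wall-clock reading on a GPU can be noisy, and the first call pays one-time setup costs, so the 588x figure is best read as an order of magnitude. A minimal sketch, assuming the same setup as above, that averages several synchronized runs after a warm-up:

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import time
import skcuda.linalg as culinalg
culinalg.init()
dim = 11900
rnd = np.random.RandomState(0)
a_gpu = gpuarray.to_gpu(rnd.rand(dim, dim).astype(np.float32))
b_gpu = gpuarray.to_gpu(rnd.rand(dim, dim).astype(np.float32))
# Warm-up run so one-time initialization is not counted.
res = culinalg.dot(a_gpu, b_gpu)
pycuda.autoinit.context.synchronize()
n_runs = 5
start = time.time()
for _ in range(n_runs):
    res = culinalg.dot(a_gpu, b_gpu)
pycuda.autoinit.context.synchronize()  # make sure all kernels have finished
print('GPU (mean of {} runs):'.format(n_runs), (time.time() - start) / n_runs)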