# PyCUDAプログラミング：GPUとCPUのドット積比較

GPUとCPUが2つの正方行列のドット積(行列積)を求めるのに費やす時間は、アレイサイズによってかなり大きく変わってくる。アレイサイズが小さいとCPUが圧倒的に速く、アレイサイズが大きくなるにつれてGPUがCPUを速度で追い抜く。と言っても、両者の間にそんなに顕著な差はない。限界の16384×16384でもその差は2.3倍でしかない。

スポンサーリンク

## CPUとGPUのドット積速度比較

このサイトから以下のベンチマークコードを拝借した。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

"""
Multiplies two square matrices together using multiple blocks and shared memory.
Each thread block is assigned a "tile" of the resulting matrix and is responsible
for generating the elements in that tile. Each thread in a block computes one element
of the tile.
"""
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 20, 15
pylab.rcParams["font.size"] = "19"
import numpy as np
from numpy import linalg as la
import time
import matplotlib.pylab as plt

from pycuda import driver, compiler, gpuarray, tools
# -- initialize the device
import pycuda.autoinit

# CUDA C source for a tiled square-matrix multiply (C = A * B), kept as a
# Python %-format template.  The placeholders MATRIX_SIZE and BLOCK_SIZE are
# substituted with a dict before the source is compiled.
#
# The kernel performs no bounds checks, so MATRIX_SIZE must be a multiple of
# BLOCK_SIZE and the launch must use BLOCK_SIZE x BLOCK_SIZE thread blocks.
#
# Both barriers inside the tile loop are required: every thread of a block
# reads shared-memory elements that were written by other threads of the
# same block.
kernel_code_template = """
__global__ void MatrixMulKernel(float *A, float *B, float *C)
{
    // Row width of A and of B (both matrices are square, same size)
    const uint wA = %(MATRIX_SIZE)s;
    const uint wB = %(MATRIX_SIZE)s;

    // Block index
    const uint bx = blockIdx.x;
    const uint by = blockIdx.y;

    // Thread index
    const uint tx = threadIdx.x;
    const uint ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    const uint aBegin = wA * %(BLOCK_SIZE)s * by;
    // Index of the last sub-matrix of A processed by the block
    const uint aEnd = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    const uint aStep = %(BLOCK_SIZE)s;

    // Index of the first sub-matrix of B processed by the block
    const uint bBegin = %(BLOCK_SIZE)s * bx;
    // Step size used to iterate through the sub-matrices of B
    const uint bStep = %(BLOCK_SIZE)s * wB;

    // The element of the block sub-matrix that is computed
    // by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B required to
    // compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)
    {
        // Shared memory for the sub-matrix of A
        __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
        // Shared memory for the sub-matrix of B
        __shared__ float Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

        // Load the matrices from global memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Barrier: make sure both tiles are fully loaded before
        // any thread starts reading them
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Barrier: make sure the preceding computation is done
        // before the next iteration overwrites As and Bs
        __syncthreads();
    }

    // Write the block sub-matrix to global memory;
    // each thread writes one element
    const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
    C[c + wB * ty + tx] = Csub;
}
"""

def benchmarkCPU(scale):
    """Time ``np.dot`` on the stored test matrices, one pair per scale factor.

    For every ``scaleFactor`` in ``range(scale)`` the two input matrices are
    read from ``testmat_<scaleFactor>.npz`` (positional ``np.savez`` layout,
    i.e. keys ``arr_0`` and ``arr_1``), multiplied on the CPU, and the product
    is written to ``cpu_res_<scaleFactor>.npz`` so it can be compared with the
    GPU result later.

    Args:
        scale: Number of scale factors to benchmark.

    Returns:
        A list with one entry per scale factor: the wall-clock time of the
        ``np.dot`` call in milliseconds.  Loading and saving are not timed.
    """
    rsCPU = []

    print ('Start CPU processing')

    for scaleFactor in range(scale):

        # load the matrices
        # (the original snippet had only the comment, leaving a_cpu and
        # b_cpu undefined)
        with np.load('testmat_{}.npz'.format(scaleFactor)) as npzfile:
            a_cpu = npzfile['arr_0']
            b_cpu = npzfile['arr_1']

        MATRIX_SIZE = 2**(scaleFactor) * 16
        print ("==" * 100)
        print ('Loading matrix size of ' + str(MATRIX_SIZE))

        # compute reference on the CPU to verify GPU computation;
        # perf_counter is monotonic, unlike time.time
        at1 = time.perf_counter()
        c_cpu = np.dot(a_cpu, b_cpu)
        at2 = time.perf_counter()
        dt12 = (at2 - at1)*1000
        print ("CPU time used:", dt12, " ms ")

        # save the results in npz
        np.savez('cpu_res_{}.npz'.format(scaleFactor), c_cpu)
        rsCPU.append(dt12)

    return rsCPU

def benchmarkGPU(scale):
    """Time the tiled CUDA matrix multiply on the stored test matrices.

    For every ``scaleFactor`` in ``range(scale)`` the two input matrices are
    read from ``testmat_<scaleFactor>.npz`` (keys ``arr_0`` and ``arr_1``),
    copied to the GPU, multiplied with ``MatrixMulKernel`` built from
    ``kernel_code_template``, copied back, and saved to
    ``gpu_res_<scaleFactor>.npz``.

    Args:
        scale: Number of scale factors to benchmark.

    Returns:
        ``[rsGPU, rsCOPY]``: two lists with one entry per scale factor, both
        in milliseconds.  ``rsCOPY`` is the time of the two ``to_gpu`` calls.
        ``rsGPU`` is the interval from the end of that copy to the end of
        ``c_gpu.get()``; it therefore also contains the output allocation and
        the kernel compilation, not only the kernel run.
    """
    rsGPU = []
    rsCOPY = []
    print ('Start GPU processing')

    # define size of blocks and tiles sub-matrix
    # (we assume that the block size is same as tile size)
    TILE_SIZE = 16
    BLOCK_SIZE = TILE_SIZE

    for scaleFactor in range(scale):

        MATRIX_SIZE = 2 ** (scaleFactor) * 16

        print ("==" * 100)
        print ('Loading Matrix size of ' + str(MATRIX_SIZE))

        # load the matrices
        # (the original snippet had only the comment, leaving a_cpu and
        # b_cpu undefined)
        with np.load('testmat_{}.npz'.format(scaleFactor)) as npzfile:
            a_cpu = npzfile['arr_0']
            b_cpu = npzfile['arr_1']

        # host -> device copy; perf_counter is monotonic, unlike time.time
        at1 = time.perf_counter()
        a_gpu = gpuarray.to_gpu(a_cpu)
        b_gpu = gpuarray.to_gpu(b_cpu)

        at2 = time.perf_counter()
        dt12 = (at2 - at1)*1000

        print ("COPY time used:", dt12, " ms ")

        # create empty gpu array for the result (C = A * B)
        c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

        # get the kernel code from the template
        # by specifying the constants MATRIX_SIZE and BLOCK_SIZE
        kernel_code = kernel_code_template % {
            'MATRIX_SIZE': MATRIX_SIZE,
            'BLOCK_SIZE': BLOCK_SIZE,
        }

        # compile the kernel code
        mod = compiler.SourceModule(kernel_code)

        # get the kernel function from the compiled module
        matrixmul = mod.get_function("MatrixMulKernel")

        # Number of blocks per grid dimension, rounded up.
        # Andreas' original code is: MATRIX_SIZE // TILE_SIZE.
        # MATRIX_SIZE is always a multiple of TILE_SIZE here, so both forms
        # give the same grid; the kernel itself has no bounds checks.
        blocks_per_dim = (MATRIX_SIZE + TILE_SIZE - 1) // TILE_SIZE

        # call the kernel on the card
        matrixmul(
            # inputs
            a_gpu, b_gpu,
            # output
            c_gpu,
            # grid of multiple blocks
            grid=(blocks_per_dim, blocks_per_dim),
            # block of multiple threads
            block=(TILE_SIZE, TILE_SIZE, 1),
        )

        # copy result from GPU
        re = c_gpu.get()

        at3 = time.perf_counter()
        dt23 = (at3 - at2)*1000
        print ("GPU time used:", dt23, " ms ")

        np.savez('gpu_res_{}.npz'.format(scaleFactor), re)

        rsGPU.append(dt23)
        rsCOPY.append(dt12)

    return [rsGPU, rsCOPY]

def calErr(scale):
    """Compare the saved CPU and GPU products for every scale factor.

    For each ``scaleFactor`` in ``range(scale)`` the arrays stored under key
    ``arr_0`` in ``cpu_res_<scaleFactor>.npz`` and
    ``gpu_res_<scaleFactor>.npz`` are loaded and the norm of their
    difference (``numpy.linalg.norm`` with default arguments) is computed.

    Args:
        scale: Number of scale factors to compare.

    Returns:
        A list with one norm per scale factor.
    """
    rsErr = []
    print ('Comparing Error')

    for scaleFactor in range(scale):

        # load both results
        # (the original snippet used res_cpu and res_gpu without ever
        # defining them)
        with np.load('cpu_res_{}.npz'.format(scaleFactor)) as cpu_file:
            res_cpu = cpu_file['arr_0']
        with np.load('gpu_res_{}.npz'.format(scaleFactor)) as gpu_file:
            res_gpu = gpu_file['arr_0']

        err = la.norm(res_cpu - res_gpu)
        rsErr.append(err)

    return rsErr

def generate_mat(scale):
    """Write one pair of random square float32 matrices per scale factor.

    Scale factor ``k`` produces two ``(16 * 2**k) x (16 * 2**k)`` matrices
    drawn with ``np.random.randn`` and stores them positionally in
    ``testmat_<k>.npz``.

    Args:
        scale: Number of scale factors; files for ``0 .. scale - 1`` are
            written to the current working directory.
    """
    # Author's note: scaleFactor = 9 was the largest that fit in the memory
    # of the GPU card this was first tried on.
    print ('Generating Matrices')

    for scaleFactor in range(scale):
        edge = 16 * 2 ** scaleFactor
        # Draw the first matrix, then the second, each converted to float32.
        pair = [
            np.random.randn(edge, edge).astype(np.float32)
            for _ in range(2)
        ]
        np.savez('testmat_{}.npz'.format(scaleFactor), *pair)

def main():
    """Run the full benchmark and plot the timings together with the error.

    Generates the test matrices, runs the CPU and GPU benchmarks, computes
    the CPU/GPU result difference, then draws the three timing curves on a
    logarithmic y axis and the error on a second y axis.  The figure is
    saved to ``test2.png`` and then shown.
    """
    GSCALE = 11
    steps = range(GSCALE)

    generate_mat(GSCALE)
    cpu_ms = benchmarkCPU(GSCALE)
    gpu_ms, copy_ms = benchmarkGPU(GSCALE)
    errors = calErr(GSCALE)

    # Timing curves, drawn in the same order as their legend entries.
    curves = (
        (cpu_ms, 'b-', "CPU processing time"),
        (gpu_ms, 'r-', "GPU processing time"),
        (copy_ms, 'y-', "Copy processing time"),
    )
    for series, fmt, caption in curves:
        plt.plot(steps, series, fmt, label=caption)

    # Tick labels show the matrix edge length for each scale factor.
    edge_lengths = [2**(x)*16 for x in steps]
    plt.xticks(steps, edge_lengths, rotation='vertical')

    plt.grid(True, which="major", linestyle="dotted")
    plt.yscale("log")

    plt.ylabel("Logrithm Response time (msec)")
    plt.xlabel("Matrix Size ")

    legend_font = dict(size=17)
    plt.legend(loc='upper left', fancybox=True, shadow=True, prop=legend_font)

    # Second y axis for the CPU/GPU difference norm.
    err_axis = plt.twinx()
    err_axis.set_ylabel('Error', color='g')
    err_axis.plot(steps, errors, 'g-', label="Norm difference")

    err_axis.legend(loc=0)
    plt.savefig('test2.png')
    plt.show()


# Run the benchmark only when executed as a script.
if __name__ == "__main__":
    main()

Generating Matrices
Start CPU processing
========================================================================================================================================================================================================
CPU time used: 0.08440017700195312  ms
========================================================================================================================================================================================================
CPU time used: 0.04553794860839844  ms
========================================================================================================================================================================================================
CPU time used: 0.04363059997558594  ms
========================================================================================================================================================================================================
CPU time used: 0.1475811004638672  ms
========================================================================================================================================================================================================
CPU time used: 0.3275871276855469  ms
========================================================================================================================================================================================================
CPU time used: 1.6989707946777344  ms
========================================================================================================================================================================================================
CPU time used: 10.755777359008789  ms
========================================================================================================================================================================================================
CPU time used: 53.93242835998535  ms
========================================================================================================================================================================================================
CPU time used: 411.6086959838867  ms
========================================================================================================================================================================================================
CPU time used: 5703.782320022583  ms
========================================================================================================================================================================================================
CPU time used: 48772.47190475464  ms
Start GPU processing
========================================================================================================================================================================================================
COPY time used: 0.5419254302978516  ms
GPU time used: 0.9243488311767578  ms
========================================================================================================================================================================================================
COPY time used: 1.15203857421875  ms
GPU time used: 3.6787986755371094  ms
========================================================================================================================================================================================================
COPY time used: 1.085519790649414  ms
GPU time used: 4.835367202758789  ms
========================================================================================================================================================================================================
COPY time used: 1.8742084503173828  ms
GPU time used: 4.185199737548828  ms
========================================================================================================================================================================================================
COPY time used: 1.0967254638671875  ms
GPU time used: 3.9589405059814453  ms
========================================================================================================================================================================================================
COPY time used: 1.2862682342529297  ms
GPU time used: 3.412008285522461  ms
========================================================================================================================================================================================================
COPY time used: 1.524209976196289  ms
GPU time used: 6.458044052124023  ms
========================================================================================================================================================================================================
COPY time used: 4.068851470947266  ms
GPU time used: 45.39346694946289  ms
========================================================================================================================================================================================================
COPY time used: 14.550209045410156  ms
GPU time used: 322.05748558044434  ms
========================================================================================================================================================================================================
COPY time used: 56.06794357299805  ms
GPU time used: 2444.7052478790283  ms
========================================================================================================================================================================================================