このサイトのコードを拝借して、numba guvectorizeのcuda, cpu, parallelの速度比較をしてみる。
スポンサーリンク
target = cuda
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of ``multBy`` and ``discount`` into ``cv``.

    Per the gufunc layout, ``multBy`` is a 2-D core array while ``discount``
    and ``cv`` are 3-D core arrays sharing one shape.

    Args:
        countRow: Number of first-axis entries of ``discount``/``cv`` to visit.
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 3-D input, indexed ``[as_of_date, ID, num]``.
        n: Number of last-axis entries to visit. This scalar is not the
            layout symbol ``n``; it bounds the innermost loop.
        countCol: Number of second-axis entries to visit.
        cv: 3-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for as_of_date in range(countRow):
        for ID in range(countCol):
            for num in range(n):
                # One tuple index per access instead of chained [i][j][k],
                # which would create an intermediate view at every step.
                cv[as_of_date, ID, num] = (
                    multBy[ID, num] * discount[as_of_date, ID, num])
# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a 4000 x 5 multiplier table and a 100 x 4000 x 5 discount cube,
# both float64 ramps; cv is the zero-initialised output buffer.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
スポンサーリンク
target = parallel
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='parallel', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of ``multBy`` and ``discount`` into ``cv``.

    Per the gufunc layout, ``multBy`` is a 2-D core array while ``discount``
    and ``cv`` are 3-D core arrays sharing one shape.

    Args:
        countRow: Number of first-axis entries of ``discount``/``cv`` to visit.
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 3-D input, indexed ``[as_of_date, ID, num]``.
        n: Number of last-axis entries to visit. This scalar is not the
            layout symbol ``n``; it bounds the innermost loop.
        countCol: Number of second-axis entries to visit.
        cv: 3-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for as_of_date in range(countRow):
        for ID in range(countCol):
            for num in range(n):
                # One tuple index per access instead of chained [i][j][k],
                # which would create an intermediate view at every step.
                cv[as_of_date, ID, num] = (
                    multBy[ID, num] * discount[as_of_date, ID, num])
# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a 4000 x 5 multiplier table and a 100 x 4000 x 5 discount cube,
# both float64 ramps; cv is the zero-initialised output buffer.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
スポンサーリンク
target = cpu
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of ``multBy`` and ``discount`` into ``cv``.

    Per the gufunc layout, ``multBy`` is a 2-D core array while ``discount``
    and ``cv`` are 3-D core arrays sharing one shape.

    Args:
        countRow: Number of first-axis entries of ``discount``/``cv`` to visit.
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 3-D input, indexed ``[as_of_date, ID, num]``.
        n: Number of last-axis entries to visit. This scalar is not the
            layout symbol ``n``; it bounds the innermost loop.
        countCol: Number of second-axis entries to visit.
        cv: 3-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for as_of_date in range(countRow):
        for ID in range(countCol):
            for num in range(n):
                # One tuple index per access instead of chained [i][j][k],
                # which would create an intermediate view at every step.
                cv[as_of_date, ID, num] = (
                    multBy[ID, num] * discount[as_of_date, ID, num])
# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a 4000 x 5 multiplier table and a 100 x 4000 x 5 discount cube,
# both float64 ramps; cv is the zero-initialised output buffer.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
0.7110588710056618/0.0019021409971173853
!nvprof --print-gpu-trace python t14.py
スポンサーリンク
改変バージョン
上のコードの改変バージョンを3ターゲットで走らせる。
target = cuda
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Write the element-wise product of two 2-D core arrays into ``cv``.

    Per the gufunc layout, ``multBy``, ``discount`` and ``cv`` all share the
    same 2-D core shape.

    Args:
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 2-D input, indexed ``[ID, num]``.
        n: Number of last-axis entries to visit.
        countCol: Number of first-axis entries to visit.
        cv: 2-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for ID in range(countCol):
        for num in range(n):
            # Tuple indexing avoids the intermediate row view that chained
            # [ID][num] indexing would create on every access.
            cv[ID, num] = multBy[ID, num] * discount[ID, num]
# Loop bounds handed to the kernel as scalars.
n = np.int64(5)
countCol = np.int64(4000)

# multBy is 4000 x 5; discount and the output buffer cv are 100 x 4000 x 5,
# i.e. they carry one more leading axis (length 100) than multBy.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
target = parallel
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='parallel', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Write the element-wise product of two 2-D core arrays into ``cv``.

    Per the gufunc layout, ``multBy``, ``discount`` and ``cv`` all share the
    same 2-D core shape.

    Args:
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 2-D input, indexed ``[ID, num]``.
        n: Number of last-axis entries to visit.
        countCol: Number of first-axis entries to visit.
        cv: 2-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for ID in range(countCol):
        for num in range(n):
            # Tuple indexing avoids the intermediate row view that chained
            # [ID][num] indexing would create on every access.
            cv[ID, num] = multBy[ID, num] * discount[ID, num]
# Loop bounds handed to the kernel as scalars.
n = np.int64(5)
countCol = np.int64(4000)

# multBy is 4000 x 5; discount and the output buffer cv are 100 x 4000 x 5,
# i.e. they carry one more leading axis (length 100) than multBy.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
target = cpu
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Write the element-wise product of two 2-D core arrays into ``cv``.

    Per the gufunc layout, ``multBy``, ``discount`` and ``cv`` all share the
    same 2-D core shape.

    Args:
        multBy: 2-D multiplier table, indexed ``[ID, num]``.
        discount: 2-D input, indexed ``[ID, num]``.
        n: Number of last-axis entries to visit.
        countCol: Number of first-axis entries to visit.
        cv: 2-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for ID in range(countCol):
        for num in range(n):
            # Tuple indexing avoids the intermediate row view that chained
            # [ID][num] indexing would create on every access.
            cv[ID, num] = multBy[ID, num] * discount[ID, num]
# Loop bounds handed to the kernel as scalars.
n = np.int64(5)
countCol = np.int64(4000)

# multBy is 4000 x 5; discount and the output buffer cv are 100 x 4000 x 5,
# i.e. they carry one more leading axis (length 100) than multBy.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
0.014195486990502104/0.0019167869759257883
CUDAとCPUの速度差が、373.8倍から7.4倍まで縮まった。
スポンサーリンク
改変バージョン2
改変バージョン2 cuda版
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of ``discount`` by the 1-D ``multBy`` and store in ``cv``.

    Per the gufunc layout, ``multBy`` is a 1-D core array while ``discount``
    and ``cv`` are 2-D core arrays sharing one shape.

    Args:
        countCol: Number of first-axis entries (rows) of ``discount``/``cv``
            to visit. Despite the name, it bounds the row loop.
        multBy: 1-D multipliers, indexed ``[num]``.
        discount: 2-D input, indexed ``[ID, num]``.
        n: Number of last-axis entries to visit.
        cv: 2-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for ID in range(countCol):
        for num in range(n):
            # Tuple indexing avoids the intermediate row view that chained
            # [ID][num] indexing would create on every access.
            cv[ID, num] = multBy[num] * discount[ID, num]
# Scalars. countRow is what gets passed as the kernel's first argument
# below; countCol is defined but not used by the call in this snippet.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)

# multBy is 4000 x 5; discount and the output buffer cv are 4000 x 100 x 5.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(4000, 100, 5).astype(np.float64)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
改変バージョン2 cpu版
import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer
@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of ``discount`` by the 1-D ``multBy`` and store in ``cv``.

    Per the gufunc layout, ``multBy`` is a 1-D core array while ``discount``
    and ``cv`` are 2-D core arrays sharing one shape.

    Args:
        countCol: Number of first-axis entries (rows) of ``discount``/``cv``
            to visit. Despite the name, it bounds the row loop.
        multBy: 1-D multipliers, indexed ``[num]``.
        discount: 2-D input, indexed ``[ID, num]``.
        n: Number of last-axis entries to visit.
        cv: 2-D output array, filled in place.

    The loop bounds come only from the scalar arguments; the kernel does not
    validate them against the array extents.
    """
    for ID in range(countCol):
        for num in range(n):
            # Tuple indexing avoids the intermediate row view that chained
            # [ID][num] indexing would create on every access.
            cv[ID, num] = multBy[num] * discount[ID, num]
# Scalars. countRow is what gets passed as the kernel's first argument
# below; countCol is defined but not used by the call in this snippet.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)

# multBy is 4000 x 5; discount and the output buffer cv are 4000 x 100 x 5.
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(4000, 100, 5).astype(np.float64)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Time a single call of the gufunc.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
0.005928767990553752/0.0018708889838308096
cudaとcpuの速度差が3.2倍まで縮まった。
グリッドサイズとブロックサイズの違いによって速度が変わるらしい。
スポンサーリンク
スポンサーリンク