# numba guvectorize (cuda, cpu, parallel)の速度比較

このサイトのコードを拝借して、numba guvectorizeのcuda, cpu, parallelの速度比較をしてみる。

スポンサーリンク

## target = cuda

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Fill ``cv`` with the element-wise product of ``multBy`` and ``discount``.

    Generalized-ufunc kernel compiled for the CUDA target.  Every array
    dimension appears as a core dimension in the layout string, so the
    whole triple loop runs inside one kernel invocation whenever the
    arguments have exactly the core shapes.

    Parameters
    ----------
    countRow : int64
        Number of leading-axis entries of ``discount``/``cv`` to process.
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :, :]
        3-D factor, indexed as ``discount[as_of_date][ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of middle-axis entries to process.
    cv : float64[:, :, :]
        Output array, written in place.
    """
    for as_of_date in range(0, countRow):
        for ID in range(0, countCol):
            for num in range(0, n):
                # multBy has no as_of_date axis; it is reused for every row.
                cv[as_of_date][ID][num] = multBy[ID][num] * \
                    discount[as_of_date][ID][num]

# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a (4000, 5) multiplier and a (100, 4000, 5) factor, both float64.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
# Output buffer with the same shape as `discount`.
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.7110588710056618

スポンサーリンク

## target = parallel

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='parallel', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Fill ``cv`` with the element-wise product of ``multBy`` and ``discount``.

    Generalized-ufunc kernel compiled for the ``parallel`` target.  Every
    array dimension appears as a core dimension in the layout string.

    Parameters
    ----------
    countRow : int64
        Number of leading-axis entries of ``discount``/``cv`` to process.
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :, :]
        3-D factor, indexed as ``discount[as_of_date][ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of middle-axis entries to process.
    cv : float64[:, :, :]
        Output array, written in place.
    """
    for as_of_date in range(0, countRow):
        for ID in range(0, countCol):
            for num in range(0, n):
                # multBy has no as_of_date axis; it is reused for every row.
                cv[as_of_date][ID][num] = (
                    multBy[ID][num] * discount[as_of_date][ID][num])

# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a (4000, 5) multiplier and a (100, 4000, 5) factor, both float64.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
# Output buffer with the same shape as `discount`.
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.002128449996234849

スポンサーリンク

## target = cpu

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Fill ``cv`` with the element-wise product of ``multBy`` and ``discount``.

    Generalized-ufunc kernel compiled for the ``cpu`` target.  Every array
    dimension appears as a core dimension in the layout string.

    Parameters
    ----------
    countRow : int64
        Number of leading-axis entries of ``discount``/``cv`` to process.
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :, :]
        3-D factor, indexed as ``discount[as_of_date][ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of middle-axis entries to process.
    cv : float64[:, :, :]
        Output array, written in place.
    """
    for as_of_date in range(0, countRow):
        for ID in range(0, countCol):
            for num in range(0, n):
                # multBy has no as_of_date axis; it is reused for every row.
                cv[as_of_date][ID][num] = (
                    multBy[ID][num] * discount[as_of_date][ID][num])

# Loop bounds handed to the kernel as scalars.
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)

# Inputs: a (4000, 5) multiplier and a (100, 4000, 5) factor, both float64.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
# Output buffer with the same shape as `discount`.
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.0019021409971173853

0.7110588710056618/0.0019021409971173853

373.82027519686585

!nvprof --print-gpu-trace python t14.py

==6092== NVPROF is profiling process 6092, command: python t14.py
Function: discount factor cumVest duration (seconds):0.6521114300121553
==6092== Profiling application: python t14.py
==6092== Profiling result:
Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
432.63ms     800ns                    -               -         -         -         -        8B  9.5367MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
432.74ms  13.504us                    -               -         -         -         -  156.25KB  11.035GB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
433.08ms  1.7421ms                    -               -         -         -         -  15.259MB  8.5538GB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.01ms     576ns                    -               -         -         -         -        8B  13.245MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.08ms     544ns                    -               -         -         -         -        8B  14.025MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.72ms  641.83ms              (1 1 1)        (32 1 1)       114        0B        0B         -           -           -           -  GeForce GTX 106         1         7  cudapy::__main__::__gufunc_cVestDiscount\$242(Array<__int64, int=1, A, mutable, aligned>, Array<double, int=3, A, mutable, aligned>, Array<double, int=4, A, mutable, aligned>, Array<__int64, int=1, A, mutable, aligned>, Array<__int64, int=1, A, mutable, aligned>, Array<double, int=4, A, mutable, aligned>) [36]
1.07755s  5.4983ms                    -               -         -         -         -  15.259MB  2.7101GB/s      Device    Pageable  GeForce GTX 106         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

スポンサーリンク

## 改変バージョン

### target = cuda

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Fill one 2-D slice of ``cv`` with ``multBy * discount`` element-wise.

    Generalized-ufunc kernel compiled for the CUDA target.  Only the last
    two axes are core dimensions, so any extra leading axis of the
    arguments is handled by the gufunc loop rather than by this body.

    Parameters
    ----------
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :]
        2-D factor slice, indexed as ``discount[ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of first-axis entries to process.
    cv : float64[:, :]
        Output slice, written in place.
    """
    for ID in range(0, countCol):
        for num in range(0, n):
            cv[ID][num] = multBy[ID][num] * discount[ID][num]

# Loop bounds handed to the kernel as scalars.
countCol = np.int64(4000)
n = np.int64(5)

# `multBy` is (4000, 5); `discount` and `cv` carry an extra leading axis of
# length 100 on top of that 2-D shape.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.014195486990502104


### target = parallel

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='parallel', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Fill one 2-D slice of ``cv`` with ``multBy * discount`` element-wise.

    Generalized-ufunc kernel compiled for the ``parallel`` target.  Only the
    last two axes are core dimensions, so any extra leading axis of the
    arguments is handled by the gufunc loop rather than by this body.

    Parameters
    ----------
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :]
        2-D factor slice, indexed as ``discount[ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of first-axis entries to process.
    cv : float64[:, :]
        Output slice, written in place.
    """
    for ID in range(0, countCol):
        for num in range(0, n):
            cv[ID][num] = multBy[ID][num] * discount[ID][num]

# Loop bounds handed to the kernel as scalars.
countCol = np.int64(4000)
n = np.int64(5)

# `multBy` is (4000, 5); `discount` and `cv` carry an extra leading axis of
# length 100 on top of that 2-D shape.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.0019180210074409842


### target = cpu

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Fill one 2-D slice of ``cv`` with ``multBy * discount`` element-wise.

    Generalized-ufunc kernel compiled for the ``cpu`` target.  Only the last
    two axes are core dimensions, so any extra leading axis of the
    arguments is handled by the gufunc loop rather than by this body.

    Parameters
    ----------
    multBy : float64[:, :]
        2-D multiplier, indexed as ``multBy[ID][num]``.
    discount : float64[:, :]
        2-D factor slice, indexed as ``discount[ID][num]``.
    n : int64
        Number of last-axis entries to process.
    countCol : int64
        Number of first-axis entries to process.
    cv : float64[:, :]
        Output slice, written in place.
    """
    for ID in range(0, countCol):
        for num in range(0, n):
            cv[ID][num] = multBy[ID][num] * discount[ID][num]

# Loop bounds handed to the kernel as scalars.
countCol = np.int64(4000)
n = np.int64(5)

# `multBy` is (4000, 5); `discount` and `cv` carry an extra leading axis of
# length 100 on top of that 2-D shape.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(100, 4000, 5)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.0019167869759257883

0.014195486990502104/0.0019167869759257883

7.405876171318323

CUDAの実行時間は、CPU比で373.8倍から7.4倍まで縮まった(CUDA版が約50倍速くなった)。

スポンサーリンク

## 改変バージョン2

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of a 2-D ``discount`` slice by the 1-D ``multBy``.

    Generalized-ufunc kernel compiled for the CUDA target.  ``multBy`` is a
    1-D core array and ``discount``/``cv`` are 2-D core arrays, so any
    extra leading axis of the arguments is handled by the gufunc loop.

    Parameters
    ----------
    countCol : int64
        Number of first-axis entries of ``discount``/``cv`` to process.
        NOTE: despite the name, the benchmark in this file passes its
        ``countRow`` value (100) for this parameter.
    multBy : float64[:]
        1-D multiplier, indexed as ``multBy[num]``.
    discount : float64[:, :]
        2-D factor slice, indexed as ``discount[ID][num]``.
    n : int64
        Number of last-axis entries to process.
    cv : float64[:, :]
        Output slice, written in place.
    """
    for ID in range(0, countCol):
        for num in range(0, n):
            cv[ID][num] = multBy[num] * discount[ID][num]

# Scalars; only `countRow` and `n` are passed to the kernel below.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)  # defined but not passed to cVestDiscount here

# `multBy` is (4000, 5); `discount` and `cv` are (4000, 100, 5), i.e. the
# axis of length 4000 now comes first.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(4000, 100, 5)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.005928767990553752


import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of a 2-D ``discount`` slice by the 1-D ``multBy``.

    Generalized-ufunc kernel compiled for the ``cpu`` target.  ``multBy`` is
    a 1-D core array and ``discount``/``cv`` are 2-D core arrays, so any
    extra leading axis of the arguments is handled by the gufunc loop.

    Parameters
    ----------
    countCol : int64
        Number of first-axis entries of ``discount``/``cv`` to process.
        NOTE: despite the name, the benchmark in this file passes its
        ``countRow`` value (100) for this parameter.
    multBy : float64[:]
        1-D multiplier, indexed as ``multBy[num]``.
    discount : float64[:, :]
        2-D factor slice, indexed as ``discount[ID][num]``.
    n : int64
        Number of last-axis entries to process.
    cv : float64[:, :]
        Output slice, written in place.
    """
    for ID in range(0, countCol):
        for num in range(0, n):
            cv[ID][num] = multBy[num] * discount[ID][num]

# Scalars; only `countRow` and `n` are passed to the kernel below.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)  # defined but not passed to cVestDiscount here

# `multBy` is (4000, 5); `discount` and `cv` are (4000, 100, 5), i.e. the
# axis of length 4000 now comes first.
multBy = np.arange(20000, dtype=np.float64).reshape(4000, 5)
discount = np.arange(2000000, dtype=np.float64).reshape(4000, 100, 5)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Wall-clock time of a single call.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):{}".format(timing))

Function: discount factor cumVest duration (seconds):0.0018708889838308096

0.005928767990553752/0.0018708889838308096

3.168957667607876

cudaとcpuの速度差が3.2倍まで縮まった。

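グリッドサイズとブロックサイズの違いによって速度が変わるらしい。最初のコードでは、すべての次元がシグネチャのコア次元になっていてgufuncのループ次元が無いため、nvprofの出力の通りグリッド (1 1 1)・ブロック (32 1 1) でカーネルが1回起動されるだけで、GPUの並列性がほとんど使われていないと考えられる。改変バージョンでは先頭の次元(100や4000)がループ次元になるので、その分だけ並列に実行されるようになったのだろう。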

スポンサーリンク
スポンサーリンク

フォローする