numba guvectorize (cuda, cpu, parallel)の速度比較

このサイトのコードを拝借して、numba guvectorizeのcuda, cpu, parallelの速度比較をしてみる。

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of multBy and discount into cv.

    For every (row, col, k) with row < countRow, col < countCol and
    k < n, stores ``multBy[col, k] * discount[row, col, k]`` in
    ``cv[row, col, k]``.

    Note: the argument ``n`` bounds the last axis; it is not the layout
    symbol ``n``, which labels the first axis of ``discount`` and ``cv``.
    """
    for row in range(countRow):
        for col in range(countCol):
            for k in range(n):
                cv[row, col, k] = multBy[col, k] * discount[row, col, k]

# Benchmark inputs: loop bounds plus float64 arrays of shape
# (4000, 5) and (100, 4000, 5).
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.7110588710056618

target = parallel

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='parallel', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of multBy and discount into cv.

    For every (row, col, k) with row < countRow, col < countCol and
    k < n, stores ``multBy[col, k] * discount[row, col, k]`` in
    ``cv[row, col, k]``.

    Note: the argument ``n`` bounds the last axis; it is not the layout
    symbol ``n``, which labels the first axis of ``discount`` and ``cv``.
    """
    for row in range(countRow):
        for col in range(countCol):
            for k in range(n):
                cv[row, col, k] = multBy[col, k] * discount[row, col, k]

# Benchmark inputs: loop bounds plus float64 arrays of shape
# (4000, 5) and (100, 4000, 5).
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.002128449996234849

target = cpu

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:,:], float64[:,:,:], int64, '
     'int64, float64[:,:,:])'],
    '(),(m,o),(n,m,o),(),() -> (n,m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countRow, multBy, discount, n, countCol, cv):
    """Write the element-wise product of multBy and discount into cv.

    For every (row, col, k) with row < countRow, col < countCol and
    k < n, stores ``multBy[col, k] * discount[row, col, k]`` in
    ``cv[row, col, k]``.

    Note: the argument ``n`` bounds the last axis; it is not the layout
    symbol ``n``, which labels the first axis of ``discount`` and ``cv``.
    """
    for row in range(countRow):
        for col in range(countCol):
            for k in range(n):
                cv[row, col, k] = multBy[col, k] * discount[row, col, k]

# Benchmark inputs: loop bounds plus float64 arrays of shape
# (4000, 5) and (100, 4000, 5).
countRow = np.int64(100)
countCol = np.int64(4000)
n = np.int64(5)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.0019021409971173853
0.7110588710056618/0.0019021409971173853
373.82027519686585

面白いことに一番高速なのがCPUだった。CPUはCUDAの373倍高速だった。以下に上記のコードのcuda版のプロファイルを載せておく。

!nvprof --print-gpu-trace python t14.py
==6092== NVPROF is profiling process 6092, command: python t14.py
Function: discount factor cumVest duration (seconds):0.6521114300121553
==6092== Profiling application: python t14.py
==6092== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
432.63ms     800ns                    -               -         -         -         -        8B  9.5367MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
432.74ms  13.504us                    -               -         -         -         -  156.25KB  11.035GB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
433.08ms  1.7421ms                    -               -         -         -         -  15.259MB  8.5538GB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.01ms     576ns                    -               -         -         -         -        8B  13.245MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.08ms     544ns                    -               -         -         -         -        8B  14.025MB/s    Pageable      Device  GeForce GTX 106         1         7  [CUDA memcpy HtoD]
435.72ms  641.83ms              (1 1 1)        (32 1 1)       114        0B        0B         -           -           -           -  GeForce GTX 106         1         7  cudapy::__main__::__gufunc_cVestDiscount$242(Array<__int64, int=1, A, mutable, aligned>, Array<double, int=3, A, mutable, aligned>, Array<double, int=4, A, mutable, aligned>, Array<__int64, int=1, A, mutable, aligned>, Array<__int64, int=1, A, mutable, aligned>, Array<double, int=4, A, mutable, aligned>) [36]
1.07755s  5.4983ms                    -               -         -         -         -  15.259MB  2.7101GB/s      Device    Pageable  GeForce GTX 106         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

改変バージョン

上のコードの改変バージョンを3ターゲットで走らせる。

target = cuda

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Multiply two 2-D core blocks element-wise into cv.

    For every (col, k) with col < countCol and k < n, stores
    ``multBy[col, k] * discount[col, k]`` in ``cv[col, k]``.
    The layout declares all three arrays as (m, o) core blocks.
    """
    for col in range(countCol):
        for k in range(n):
            cv[col, k] = multBy[col, k] * discount[col, k]

# Benchmark inputs: multBy is 2-D (4000, 5); discount and cv are 3-D
# (100, 4000, 5), all float64.
n = np.int64(5)
countCol = np.int64(4000)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.014195486990502104

target = parallel

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='parallel', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Multiply two 2-D core blocks element-wise into cv.

    For every (col, k) with col < countCol and k < n, stores
    ``multBy[col, k] * discount[col, k]`` in ``cv[col, k]``.
    The layout declares all three arrays as (m, o) core blocks.
    """
    for col in range(countCol):
        for k in range(n):
            cv[col, k] = multBy[col, k] * discount[col, k]

# Benchmark inputs: multBy is 2-D (4000, 5); discount and cv are 3-D
# (100, 4000, 5), all float64.
n = np.int64(5)
countCol = np.int64(4000)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.0019180210074409842

target = cpu

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(float64[:,:], float64[:,:], int64, int64, '
     'float64[:,:])'],
    '(m,o),(m,o),(),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(multBy, discount, n, countCol, cv):
    """Multiply two 2-D core blocks element-wise into cv.

    For every (col, k) with col < countCol and k < n, stores
    ``multBy[col, k] * discount[col, k]`` in ``cv[col, k]``.
    The layout declares all three arrays as (m, o) core blocks.
    """
    for col in range(countCol):
        for k in range(n):
            cv[col, k] = multBy[col, k] * discount[col, k]

# Benchmark inputs: multBy is 2-D (4000, 5); discount and cv are 3-D
# (100, 4000, 5), all float64.
n = np.int64(5)
countCol = np.int64(4000)
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(100, 4000, 5).astype(np.float64)
cv = np.zeros((100, 4000, 5), dtype=np.float64)

# Time a single call and report the elapsed wall-clock seconds.
func_start = timer()
cv = cVestDiscount(multBy, discount, n, countCol, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.0019167869759257883
0.014195486990502104/0.0019167869759257883
7.405876171318323

CUDAとCPUの速度差が、373.8倍から7.4倍まで縮まった（CUDA版の実行時間は0.711秒から0.0142秒へ、約50倍高速になった）。

改変バージョン2

改変バージョン2 cuda版

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cuda', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of a 2-D core block by a 1-D vector.

    For every (i, k) with i < countCol and k < n, stores
    ``multBy[k] * discount[i, k]`` in ``cv[i, k]``.

    Note: despite its name, ``countCol`` bounds the loop over the first
    axis (layout symbol ``m``) of ``discount`` and ``cv``.
    """
    for i in range(countCol):
        for k in range(n):
            cv[i, k] = multBy[k] * discount[i, k]

# Benchmark inputs: multBy is (4000, 5); discount and cv are
# (4000, 100, 5), all float64.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)  # assigned but not passed to the call below
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(4000, 100, 5).astype(np.float64)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Time a single call; countRow (100) is supplied as the first argument.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.005928767990553752

改変バージョン2 cpu版

import numpy as np
from numba import guvectorize
import time
from timeit import default_timer as timer

@guvectorize(
    ['void(int64, float64[:], float64[:,:], int64, '
     'float64[:,:])'],
    '(),(o),(m,o),() -> (m,o)',
    target='cpu', nopython=True)
def cVestDiscount(countCol, multBy, discount, n, cv):
    """Scale each row of a 2-D core block by a 1-D vector.

    For every (i, k) with i < countCol and k < n, stores
    ``multBy[k] * discount[i, k]`` in ``cv[i, k]``.

    Note: despite its name, ``countCol`` bounds the loop over the first
    axis (layout symbol ``m``) of ``discount`` and ``cv``.
    """
    for i in range(countCol):
        for k in range(n):
            cv[i, k] = multBy[k] * discount[i, k]

# Benchmark inputs: multBy is (4000, 5); discount and cv are
# (4000, 100, 5), all float64.
countRow = np.int64(100)
n = np.int64(5)
countCol = np.int64(4000)  # assigned but not passed to the call below
multBy = np.arange(20000).reshape(4000, 5).astype(np.float64)
discount = np.arange(2000000).reshape(4000, 100, 5).astype(np.float64)
cv = np.zeros((4000, 100, 5), dtype=np.float64)

# Time a single call; countRow (100) is supplied as the first argument.
func_start = timer()
cv = cVestDiscount(countRow, multBy, discount, n, cv)
timing = timer() - func_start
print("Function: discount factor cumVest duration (seconds):" + str(timing))
Function: discount factor cumVest duration (seconds):0.0018708889838308096
0.005928767990553752/0.0018708889838308096
3.168957667607876

cudaとcpuの速度差が3.2倍まで縮まった。

グリッドサイズとブロックサイズの違いによって速度が変わるらしい。実際、最初のバージョンのプロファイルでは、カーネルがグリッドサイズ (1 1 1)、ブロックサイズ (32 1 1) で1回だけ起動されており、その1回の実行に641.83msかかっている。