今日はこのサイトを参考にして、CUDA C Wrapperを作成して、ctypesを使ってpythonで関数を利用する。
スポンサーリンク
CUDA C Wrapperの作成¶
#include <cuda.h>
#include <cuda_runtime_api.h>
__global__ void cuda_sum_kernel(float *a, float *b, float *c, size_t size)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) {
return;
}
c[idx] = a[idx] + b[idx];
}
extern "C" {
void cuda_sum(float *a, float *b, float *c, size_t size)
{
float *d_a, *d_b, *d_c;
cudaMalloc((void **)&d_a, size * sizeof(float));
cudaMalloc((void **)&d_b, size * sizeof(float));
cudaMalloc((void **)&d_c, size * sizeof(float));
cudaMemcpy(d_a, a, size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size * sizeof(float), cudaMemcpyHostToDevice);
cuda_sum_kernel <<< ceil(size / 256.0), 256 >>> (d_a, d_b, d_c, size);
cudaMemcpy(c, d_c, size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
}
!nvcc -Xcompiler -fPIC -shared -o cuda_sum.so cuda_sum.cu
スポンサーリンク
CUDA C Wrapperを使う¶
import numpy as np
import ctypes
from ctypes import *
# extract cuda_sum function pointer in the shared object cuda_sum.so
def get_cuda_sum():
dll = ctypes.CDLL('./cuda_sum.so', mode=ctypes.RTLD_GLOBAL)
func = dll.cuda_sum
func.argtypes = [POINTER(c_float), POINTER(c_float), POINTER(c_float), c_size_t]
return func
# create __cuda_sum function with get_cuda_sum()
__cuda_sum = get_cuda_sum()
# convenient python wrapper for __cuda_sum
# it does all job with types convertation
# from python ones to C++ ones
def cuda_sum(a, b, c, size):
a_p = a.ctypes.data_as(POINTER(c_float))
b_p = b.ctypes.data_as(POINTER(c_float))
c_p = c.ctypes.data_as(POINTER(c_float))
__cuda_sum(a_p, b_p, c_p, size)
# testing, sum of two arrays of ones and output head part of resulting array
if __name__ == '__main__':
size=int(1024*1024)
a = np.ones(size).astype('float32')
b = np.ones(size).astype('float32')
c = np.zeros(size).astype('float32')
cuda_sum(a, b, c, size)
print (a[:20])
print (b[:20])
print (c[:20])
スポンサーリンク
PyCUDAを使う¶
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule
mod = SourceModule(open("cuda_sum.cu", "r").read())
importedKernel = mod.get_function("cuda_sum_kernel")
for i in range(5):
a = i
b = i*i
a_gpu = gpuarray.to_gpu(np.array([a],dtype = np.float32))
b_gpu = gpuarray.to_gpu(np.array([b],dtype = np.float32))
c_gpu = gpuarray.to_gpu(np.empty_like(1)).astype(np.float32)
importedKernel(a_gpu,b_gpu,c_gpu,block=(1,1,1))
print(a, '+',b, '=', c_gpu )
for i in range(5):
a_gpu = gpuarray.to_gpu(np.array(i))
b_gpu = gpuarray.to_gpu(np.array(i*i))
c_gpu = gpuarray.to_gpu(np.zeros_like(0))
importedKernel(a_gpu,b_gpu,c_gpu,block=(1,1,1))
print(a_gpu, '+',b_gpu, '=', c_gpu )
スポンサーリンク
スポンサーリンク