PyCUDAの代わりにCUDA C WrapperをPythonで使う

今日はこのサイトを参考にして、CUDA C Wrapperを作成し、ctypesを使ってPythonから関数を利用する。

スポンサーリンク

CUDA C Wrapperの作成

#include <cuda.h>
#include <cuda_runtime_api.h>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i < size.
// One thread computes one element; threads beyond the end do nothing.
__global__ void cuda_sum_kernel(float *a, float *b, float *c, size_t size)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        c[i] = a[i] + b[i];
    }
}

extern "C" {
// Host wrapper: copies `a` and `b` (size floats each) to the device,
// launches cuda_sum_kernel to compute c = a + b element-wise, copies the
// result back into `c`, and releases the device buffers.
// NOTE(review): CUDA API return codes are not checked; failures are silent.
void cuda_sum(float *a, float *b, float *c, size_t size)
{
    const size_t threads_per_block = 256;
    const size_t bytes = size * sizeof(float);
    float *d_a, *d_b, *d_c;

    cudaMalloc((void **)&d_a, bytes);
    cudaMalloc((void **)&d_b, bytes);
    cudaMalloc((void **)&d_c, bytes);

    cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);

    // Integer ceiling division: exact for all size_t values and avoids the
    // float round-trip of ceil(size / 256.0) (which needs <math.h> and loses
    // precision above 2^53).
    size_t blocks = (size + threads_per_block - 1) / threads_per_block;
    cuda_sum_kernel <<< blocks, threads_per_block >>> (d_a, d_b, d_c, size);

    // cudaMemcpy on the default stream waits for the kernel to finish.
    cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
}
Writing cuda_sum.cu
!nvcc -Xcompiler -fPIC -shared -o cuda_sum.so cuda_sum.cu

CUDA C Wrapperを使う

import numpy as np
import ctypes
from ctypes import *

# Extract the cuda_sum function pointer from the shared object cuda_sum.so.
def get_cuda_sum():
    """Load ./cuda_sum.so and return its `cuda_sum` entry point.

    The returned callable mirrors the C signature
    ``void cuda_sum(float *a, float *b, float *c, size_t size)``.
    """
    dll = ctypes.CDLL('./cuda_sum.so', mode=ctypes.RTLD_GLOBAL)
    func = dll.cuda_sum
    func.argtypes = [POINTER(c_float), POINTER(c_float), POINTER(c_float), c_size_t]
    # The C function returns void; without this, ctypes assumes a C `int`
    # return and would read a garbage register value after each call.
    func.restype = None
    return func

# Resolve the C entry point once at import time via get_cuda_sum();
# reused by the cuda_sum() wrapper below.
__cuda_sum = get_cuda_sum()

# Convenient Python wrapper for __cuda_sum: converts the NumPy array
# arguments into the C float pointers the shared-library function expects.
def cuda_sum(a, b, c, size):
    """Add `a` and `b` element-wise into `c` via the CUDA shared library."""
    ptrs = [arr.ctypes.data_as(POINTER(c_float)) for arr in (a, b, c)]
    __cuda_sum(ptrs[0], ptrs[1], ptrs[2], size)

# Smoke test: sum two arrays of ones and print the head of each array.
if __name__ == '__main__':
    n = 1024 * 1024

    a = np.ones(n, dtype='float32')
    b = np.ones(n, dtype='float32')
    c = np.zeros(n, dtype='float32')

    cuda_sum(a, b, c, n)
    print(a[:20])
    print(b[:20])
    print(c[:20])
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]

PyCUDAを使う

from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule

# Compile the .cu source at runtime and fetch the raw kernel.
mod = SourceModule(open("cuda_sum.cu", "r").read())
importedKernel = mod.get_function("cuda_sum_kernel")
for i in range(5):
    a = i
    b = i * i
    a_gpu = gpuarray.to_gpu(np.array([a], dtype=np.float32))
    b_gpu = gpuarray.to_gpu(np.array([b], dtype=np.float32))
    # One-element float32 output buffer (replaces the original
    # to_gpu(np.empty_like(1)).astype(...) round-trip, which allocated an
    # uninitialized 0-d int array first).
    c_gpu = gpuarray.zeros(1, dtype=np.float32)
    # The kernel signature is (float*, float*, float*, size_t size):
    # the size argument must be passed explicitly (np.uintp matches size_t)
    # or the kernel's bounds check reads an undefined value.
    importedKernel(a_gpu, b_gpu, c_gpu, np.uintp(1), block=(1, 1, 1))
    print(a, '+', b, '=', c_gpu)
0 + 0 = 0.0
1 + 1 = 2.0
2 + 4 = 6.0
3 + 9 = 12.0
4 + 16 = 20.0
for i in range(5):
    # Explicit float32 scalars: the kernel dereferences float*, while
    # np.array(i) defaults to a platform int dtype whose bytes would be
    # reinterpreted as floats on the device.
    a_gpu = gpuarray.to_gpu(np.array([i], dtype=np.float32))
    b_gpu = gpuarray.to_gpu(np.array([i * i], dtype=np.float32))
    c_gpu = gpuarray.zeros(1, dtype=np.float32)
    # Pass the size_t `size` argument the kernel signature requires.
    importedKernel(a_gpu, b_gpu, c_gpu, np.uintp(1), block=(1, 1, 1))
    print(a_gpu, '+', b_gpu, '=', c_gpu)
0 + 0 = 0
1 + 1 = 2
2 + 4 = 6
3 + 9 = 12
4 + 16 = 20

CUDA C Wrapperを使うよりもPyCUDAを使った方が圧倒的にスマートなコードになる。