C、CUDA、PyCUDAを使って、$ i+i^2 $ を計算するプログラムを書く。

i+iの2乗の計算(C version)¶

Cで $ i+i^2 $ をプログラミングすると以下のようなコードになる。

%%writefile cpuAdd.c
#include <stdio.h>

/* Compute c[k] = k + k*k for k = 0..N-1 on the CPU and print one value per line. */
int main(void)
{
    const int N = 10;
    float a[N], b[N], c[N];
    int k;

    /* Fill the two inputs: a holds k, b holds k squared. */
    for (k = 0; k < N; k++) {
        a[k] = (float)k;
        b[k] = (float)(k * k);
    }

    /* Element-wise sum of the two inputs. */
    for (k = 0; k < N; k++) {
        c[k] = a[k] + b[k];
    }

    /* Print the results. */
    for (k = 0; k < N; k++) {
        printf("%f \n", c[k]);
    }

    return 0;
}

Overwriting cpuAdd.c

!g++ cpuAdd.c -o cpua

!./cpua

i+iの2乗の計算(Cuda version)¶

cuda版は以下のように非常に長ったらしいコードになる。

%%writefile gpuAdd.cu
#include <stdio.h>
#include <cuda_runtime.h>
// CUDA kernel: element-wise vector addition, C[k] = A[k] + B[k] for 0 <= k < numElements.
// Each thread handles at most one element, chosen by its flat 1D global index
// (blockDim.x * blockIdx.x + threadIdx.x); threads whose index is past the end do nothing.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard: the launch may contain more threads than there are elements.
    if (gid >= numElements)
    {
        return;
    }

    C[gid] = A[gid] + B[gid];
}

// Report a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

/**
 * Host main routine
 *
 * Fills two host vectors (a[i] = i*i, b[i] = i), copies them to the device,
 * launches vectorAdd on them, copies the sum back and prints it.
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure a message is written to stderr, the remaining steps are skipped,
 * the device buffers are released and the program returns 1.
 * Returns 0 on success.
 */
int main(void)
{
    // const so the host arrays below have a compile-time size.
    const int numElements = 15;
    const size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    float a[numElements], b[numElements], c[numElements];
    // Start as NULL so the cleanup at the end is valid even if an allocation fails.
    float *a_gpu = NULL, *b_gpu = NULL, *c_gpu = NULL;

    bool ok = cudaOk(cudaMalloc((void **)&a_gpu, size), "cudaMalloc(a_gpu)")
           && cudaOk(cudaMalloc((void **)&b_gpu, size), "cudaMalloc(b_gpu)")
           && cudaOk(cudaMalloc((void **)&c_gpu, size), "cudaMalloc(c_gpu)");

    for (int i = 0; i < numElements; ++i)
    {
        a[i] = i * i;
        b[i] = i;
    }

    if (ok)
    {
        // Copy the host input vectors A and B in host memory to the device input vectors in
        // device memory
        printf("Copy input data from the host memory to the CUDA device\n");
        ok = cudaOk(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_gpu)")
          && cudaOk(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_gpu)");
    }

    if (ok)
    {
        // Launch the Vector Add CUDA Kernel
        const int threadsPerBlock = 256;
        // Ceiling division so a partial last block is still launched.
        const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_gpu, b_gpu, c_gpu, numElements);
        // A kernel launch returns no status itself; launch errors are read back here.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch");
    }

    if (ok)
    {
        // Copy the device result vector in device memory to the host result vector
        // in host memory.
        printf("Copy output data from the CUDA device to the host memory\n");
        ok = cudaOk(cudaMemcpy(c, c_gpu, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c_gpu -> c)");
    }

    if (ok)
    {
        for (int i = 0; i < numElements; ++i)
        {
            printf("%f \n", c[i]);
        }
    }

    // Free device global memory (cudaFree on a NULL pointer performs no operation).
    const bool freedA = cudaOk(cudaFree(a_gpu), "cudaFree(a_gpu)");
    const bool freedB = cudaOk(cudaFree(b_gpu), "cudaFree(b_gpu)");
    const bool freedC = cudaOk(cudaFree(c_gpu), "cudaFree(c_gpu)");

    if (!(ok && freedA && freedB && freedC))
    {
        return 1;
    }

    printf("Done\n");
    return 0;
}

Writing gpuAdd.cu

!nvcc gpuAdd.cu -o gpu

!./gpu

[Vector addition of 15 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 1 blocks of 256 threads
Copy output data from the CUDA device to the host memory
0.000000 
2.000000 
6.000000 
12.000000 
20.000000 
30.000000 
42.000000 
56.000000 
72.000000 
90.000000 
110.000000 
132.000000 
156.000000 
182.000000 
210.000000 
Done

i+iの2乗の計算(PyCUDA version 1)¶

pycuda版は以下のように非常にスマートなコードにできる。

from pycuda import autoinit
from pycuda import gpuarray
import numpy as np

# Host-side data: a = 0..14, b = a squared, c = zeros; all float32 and the same length.
aux = range(15)
a = np.asarray(aux, dtype=np.float32)
b = np.multiply(a, a).astype(np.float32)
c = np.zeros(len(aux), dtype=np.float32)

# Upload the three host arrays to the GPU.
a_gpu, b_gpu, c_gpu = (gpuarray.to_gpu(host) for host in (a, b, c))
# Element-wise sum of the two device arrays; rebinds c_gpu to the result array.
c_gpu = a_gpu + b_gpu

a_gpu, b_gpu, c_gpu

(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14.], dtype=float32),
 array([  0.,   1.,   4.,   9.,  16.,  25.,  36.,  49.,  64.,  81., 100.,
        121., 144., 169., 196.], dtype=float32),
 array([  0.,   2.,   6.,  12.,  20.,  30.,  42.,  56.,  72.,  90., 110.,
        132., 156., 182., 210.], dtype=float32))

PyCUDA version 2¶

from pycuda.elementwise import ElementwiseKernel

# Build an element-wise kernel from a C-style argument list and a per-element
# expression that is indexed by i.
myCudaFunc = ElementwiseKernel(
    arguments="float *a, float *b, float *c",
    operation="c[i] = a[i]+b[i]"
)
# Overwrite the device result buffer with the host array c (all zeros)
# so the kernel's effect is visible afterwards.
c_gpu.set(c)
c_gpu

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

# Run the element-wise kernel with a_gpu and b_gpu as inputs and c_gpu as output.
myCudaFunc(a_gpu, b_gpu, c_gpu)
c_gpu

array([  0.,   2.,   6.,  12.,  20.,  30.,  42.,  56.,  72.,  90., 110.,
       132., 156., 182., 210.], dtype=float32)

PyCUDA version 3¶

バージョン3は最初にcuda編で作ったgpuAdd.cuファイルを読み込む。

from pycuda.compiler import SourceModule

# Read the CUDA source file and compile it into a module.
# The with-statement closes the file handle as soon as the text has been read;
# previously the handle returned by open() was never closed.
with open("gpuAdd.cu", "r") as cudaCode:
    myCUDACode = cudaCode.read()
myCode = SourceModule(myCUDACode)

/root/.pyenv/versions/miniconda3-4.3.30/envs/caffe2/lib/python3.6/site-packages/ipykernel_launcher.py:5: UserWarning: The CUDA compiler succeeded, but said the following:
kernel.cu(17): warning: linkage specification is not allowed


  """

# Fetch a callable handle to the vectorAdd kernel from the compiled module.
importedKernel = myCode.get_function("vectorAdd")

# Problem size and launch-configuration values.
nData = a.size
nThreadsPerBlock = 256
nBlockPerGrid = nGridsPerBlock = 1

# Reset the device result buffer from the host array c before launching.
c_gpu.set(c)
c_gpu

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

# vectorAdd is declared with four parameters (A, B, C, numElements). The element
# count must be passed explicitly as a 32-bit int; without it the kernel's
# bounds check would read an undefined value.
# Block and grid sizes come from the launch-configuration variables set above.
importedKernel(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, np.int32(nData),
               block=(nThreadsPerBlock, 1, 1), grid=(nBlockPerGrid, 1))
c_gpu

array([  0.,   2.,   6.,  12.,  20.,  30.,  42.,  56.,  72.,  90., 110.,
       132., 156., 182., 210.], dtype=float32)

CUDA Cに比べると、PyCUDAプログラミングは簡素化できるようだ。

参考サイト: https://github.com/