CUDA C Programming:PyCUDAにcu fileを取り込む

今回はCUDA Cプログラム(cuファイル)をPyCUDAに関数として取り込んで使用するコードを書いてみる。

スポンサーリンク

参考サイトをgit clone

!mkdir git
cd git
/home/workspace/1/git
!git clone https://github.com/wlandau/gpu.git
Cloning into 'gpu'...
remote: Enumerating objects: 674, done.
remote: Total 674 (delta 0), reused 0 (delta 0), pack-reused 674
Receiving objects: 100% (674/674), 20.75 MiB | 4.76 MiB/s, done.
Resolving deltas: 100% (203/203), done.
Checking connectivity... done.
cd gpu
/home/workspace/1/git/gpu
ls
404.html          cudac-atomics.html   index.html        resources.html
Code/             cudac-examples.html  intro.html        short-course.html
README.md         cudac-intro.html     lectures/         talks.html
Resources/        cudac-memory.html    parallelism.html  thrust.html
_config.yml       curand.html          pointers.html     video/
_layouts/         fig/                 pycuda.html
cublas-cula.html  img/                 python.html
cd Code/CUDA_C/pairwise_sum_timed
/home/workspace/1/git/gpu/Code/CUDA_C/pairwise_sum_timed
ls
Makefile  pairwise_sum_timed.cu*

CUDA C fileをloadして実行

# %load pairwise_sum_timed.cu
#include <stdio.h> 
#include <stdlib.h> 
#include <math.h>
#include <time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h> 

/*
 * This program computes the sum of the elements of 
 * vector v using the pairwise (cascading) sum algorithm.
 */

#define N 1024 // length of vector v. MUST BE A POWER OF 2!!!

// Populate v[0] .. v[n-1] with pseudo-random floats: each element is one
// rand() draw divided by RAND_MAX. Nothing is written when n <= 0.
void vfill(float* v, int n){
  int k = 0;
  while (k < n) {
    v[k] = (float) rand() / RAND_MAX;
    ++k;
  }
}

// Dump the first n elements of v to stdout: a "v = " header line, then one
// element per line in %7.3f format, then a terminating blank line.
void vprint(float* v, int n){
  printf("v = \n");
  for(int k = 0; k < n; ++k)
    printf("%7.3f\n", v[k]);
  printf("\n");
}

// Pairwise-sum (cascading sum) the elements of vector v in place and leave
// the total in v[0].
//
// Launch contract:
//   - Launch exactly ONE block. Only threadIdx.x is used for indexing, so
//     extra blocks would all operate on the same elements.
//   - v must hold 2 * blockDim.x floats; every one of them is read, and the
//     front of the vector is overwritten with partial sums (destructive).
//   - blockDim.x does not have to be a power of 2: each round folds the upper
//     part of the remaining range onto the lower part and, when the remaining
//     length is odd, carries the middle element over untouched. For
//     power-of-2 launches the sequence of additions is the same as a plain
//     halving loop.
__global__ void psum(float *v){ 
  int t = threadIdx.x;        // Thread index.
  int len = 2 * blockDim.x;   // Number of elements still to be combined.

  while (len > 1) {
    int half = (len + 1) / 2; // ceil(len / 2): size of the range kept after this round.
    if(t < len / 2)           // One addition per element in the upper part.
      v[t] += v[t + half];
    // Barrier sits outside the branch so every thread of the block reaches it;
    // it makes this round's sums visible before the next round reads them.
    __syncthreads();
    len = half;
  }
}

// Sequentially accumulate v[0] + v[1] + ... + v[len-1] in single precision,
// front to back, and return the total. Returns 0 when len <= 0.
float lsum(float *v, int len){
  float total = 0;
  const float *cursor = v;
  for(int remaining = len; remaining > 0; --remaining){
    total += *cursor++;
  }
  return total;
}

// Abort the program with a diagnostic if a CUDA runtime call did not succeed.
// `what` names the operation for the error message. On failure the process
// exits immediately, so outstanding allocations are left for the OS/driver
// to reclaim.
static void checkCuda(cudaError_t status, const char *what){
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Fill a vector of N random floats, sum it linearly on the CPU and pairwise
// on the GPU, and print both sums together with their elapsed times.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any CUDA
// call fails.
int main (void){ 
  float *v_h, *v_d; // host and device copies of our vector, respectively
  
  // dynamically allocate memory on the host for v_h
  v_h = (float*) malloc(N * sizeof(*v_h)); 
  if (v_h == NULL) {
    fprintf(stderr, "Failed to allocate %zu bytes of host memory\n", N * sizeof(*v_h));
    return EXIT_FAILURE;
  }
  
  // dynamically allocate memory on the device for v_d
  checkCuda(cudaMalloc((void**) &v_d, N * sizeof(*v_d)), "cudaMalloc(v_d)");
  
  // Fill v_h with N random floating point numbers.
  vfill(v_h, N);
  
  // Print v_h to the console
  // vprint(v_h, N);
  
  // Write the contents of v_h to v_d
  checkCuda(cudaMemcpy(v_d, v_h, N * sizeof(float), cudaMemcpyHostToDevice),
            "cudaMemcpy(host to device)");
    
  // compute the linear sum of the elements of v_h on the CPU and return the result
  // also, time the result.
  clock_t start = clock();
  float s = lsum(v_h, N);
  
  // Subtract the tick counts first and convert afterwards: casting clock()
  // to float before subtracting would round large tick values and distort
  // the (small) difference.
  float elapsedTime = (float) (clock() - start) / CLOCKS_PER_SEC;
  printf("Linear Sum = %7.3f, CPU Time elapsed: %f seconds\n", s, elapsedTime);
 
  // Compute the pairwise sum of the elements of v_d and store the result in v_d[0].
  // Also, time the computation.
  
  float   gpuElapsedTime;
  cudaEvent_t gpuStart, gpuStop;
  checkCuda(cudaEventCreate(&gpuStart), "cudaEventCreate(gpuStart)");
  checkCuda(cudaEventCreate(&gpuStop), "cudaEventCreate(gpuStop)");
  checkCuda(cudaEventRecord(gpuStart, 0), "cudaEventRecord(gpuStart)");

  // One block of N/2 threads: psum expects the vector to be twice the block size.
  psum<<< 1, N/2 >>>(v_d);
  
  checkCuda(cudaEventRecord(gpuStop, 0), "cudaEventRecord(gpuStop)");
  // A kernel launch returns no status itself; query it explicitly. This is
  // done after recording the stop event so the check stays out of the timed
  // region.
  checkCuda(cudaGetLastError(), "psum kernel launch");
  checkCuda(cudaEventSynchronize(gpuStop), "cudaEventSynchronize(gpuStop)");
  checkCuda(cudaEventElapsedTime(&gpuElapsedTime, gpuStart, gpuStop),
            "cudaEventElapsedTime"); // time in milliseconds
  checkCuda(cudaEventDestroy(gpuStart), "cudaEventDestroy(gpuStart)");
  checkCuda(cudaEventDestroy(gpuStop), "cudaEventDestroy(gpuStop)");
  
  // Write the pairwise sum, v_d[0], to v_h[0].
  checkCuda(cudaMemcpy(v_h, v_d, sizeof(float), cudaMemcpyDeviceToHost),
            "cudaMemcpy(device to host)");
  
  // Print the pairwise sum.
  printf("Pairwise Sum = %7.3f, GPU Time elapsed: %f seconds\n", v_h[0], gpuElapsedTime/1000.0);
   
  // Free dynamically-allocated host memory
  free(v_h);

  // Free dynamically-allocated device memory    
  checkCuda(cudaFree(v_d), "cudaFree(v_d)");

  return 0;
}
!nvcc -O2 pairwise_sum_timed.cu -o pairwise_sum_timed
!./pairwise_sum_timed
Linear Sum = 518.913, CPU Time elapsed: 0.000010 seconds
Pairwise Sum = 518.913, GPU Time elapsed: 0.000039 seconds

Pycudaにcu fileを取り込む

import pycuda.autoinit
from pycuda import autoinit
from pycuda import gpuarray
import numpy as np
from pycuda.compiler import SourceModule

# Read the CUDA C source text and hand it to SourceModule. The context
# manager closes the file handle as soon as the text has been read, instead
# of leaving it open until garbage collection.
with open("pairwise_sum_timed.cu", "r") as cu_file:
    mod = SourceModule(cu_file.read())

取り込んだ関数を使う

# Look up the kernel named "psum" in the module built from the .cu source.
importedKernel = mod.get_function("psum")
# Build the host vector [0, 1, ..., 9] as float32 ...
aux = range(10)
a = np.array(aux).astype(np.float32)
# ... and create its GPU-side counterpart that will be passed to the kernel.
a_gpu = gpuarray.to_gpu(a)
a_gpu
array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float32)
importedKernel(a_gpu, block=(256,1,1))
a_gpu
array([45., 25.,  8., 10.,  4.,  5.,  6.,  7.,  8.,  9.], dtype=float32)
importedKernel(a_gpu, block=(256,1,1))
a_gpu
array([127.,  56.,  14.,  17.,   4.,   5.,   6.,   7.,   8.,   9.],
      dtype=float32)
importedKernel(a_gpu, block=(256,1,1))
a_gpu
array([253.,  94.,  20.,  24.,   4.,   5.,   6.,   7.,   8.,   9.],
      dtype=float32)
importedKernel(a_gpu, block=(256,1,1))
a_gpu
array([430., 139.,  26.,  31.,   4.,   5.,   6.,   7.,   8.,   9.],
      dtype=float32)

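一見よく分からない結果だが、これはpsumが配列を上書きしながら（in-placeで）和を計算するためである。1回目の呼び出しで得られるa_gpu[0]=45は0〜9の和として正しく、2回目以降は途中結果で書き換えられた配列をさらに足し込んでいるので、呼ぶたびに値が増えていく。なお、このカーネルは配列の長さがblockDim.xのちょうど2倍であることを前提としているため、要素数10の配列にblock=(256,1,1)を指定すると配列の範囲外まで読み書きしてしまう（今回はたまたま範囲外の値が0だったので和が合っている）。本来は2のべき乗の長さの配列を渡し、blockにはその半分のスレッド数を指定するべきである。ともあれ、cuファイルをPyCUDAに取り込むという当初の目的は達成できたので良しとしよう。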