気が付いたら、PyCUDAがいつの間にかPython 3.7に対応していた。仕事が早くてびっくりした。Python 3.7に対応していないというクレームが付いて、Boostに問題があるとかなんとかいう話になって、pybind11を使えば何とかなるかもしれないとかいう返答を聞いた時は、これは来年までお預けだなと思っていたが、あっさり解決されていた。
!pip3 install git+https://github.com/inducer/pycuda.git
試しに適当なコードを走らせてみる。
import time
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import math
# import matplotlib.pyplot as plt
from sys import getsizeof
# -- initialize the device
import pycuda.autoinit
# -----------------------------------------------------
# CUDA kernel source.
# MatProd computes C = A * B for matrices stored as flat, row-major float
# arrays. Each thread produces a single element C[row, col]:
#   - row / col come from the thread's global y / x index,
#   - dimAx, dimBx and dimCx are the row widths (column counts) of A, B, C,
#   - dimCy is the number of rows of C,
#   - threads whose (row, col) falls outside C do nothing,
#   - the dot product is accumulated in a double and stored back as float.
kernel_code_template = """
__global__ void MatProd(float* C, float* A, float* B, int dimAx, int dimBx, int dimCx, int dimCy)
{
int row = blockDim.y*blockIdx.y+threadIdx.y;
int col = blockDim.x*blockIdx.x+threadIdx.x;
double Result = 0;
if (row<=dimCy-1 && col<=dimCx-1)
{
for (int k = 0; k < dimAx; k++)
{
Result += A[k + dimAx*row] * B[col + dimBx*k];
}
C[col + row*dimCx] = Result;
}
}
"""
# The template has no placeholders to fill in, so it is used verbatim.
kernel_code=kernel_code_template
# Compile the CUDA source into a module.
mod = compiler.SourceModule(kernel_code)
# Look up the compiled kernel by name to get a callable handle.
MatProd = mod.get_function("MatProd")
warp_size=32 # Edge length of the square thread block used at launch (32 x 32 threads per block).
# --------------------------------------------------------------------
# --------------------BEGIN of INITIALISATION-------------------------
# --------------------------------------------------------------------
# Host-side operands of the product C = A * B.
# They are treated as given input, so building them sits outside every
# timed region below.
nb_linesA, nb_columnsA = 1024, 1024
nb_columnsB = 1024
# The inner dimensions must agree: B has as many rows as A has columns.
nb_linesB = nb_columnsA
# Uniform random values, stored as single precision to match the kernel's
# float* arguments.
a_cpu = np.random.rand(nb_linesA, nb_columnsA).astype(np.float32)
b_cpu = np.random.rand(nb_linesB, nb_columnsB).astype(np.float32)
# --------------------------------------------------------------------
# --------------------End of INITIALISATION---------------------------
# --------------------------------------------------------------------
# --------------------------------------------------------------------
# --------------------CUDA PART---------------------------------------
# --------------------------------------------------------------------
# Timed GPU pipeline: upload A and B, allocate C, launch MatProd, download C.
#
# Timestamps use time.perf_counter(). The previous time.clock() was
# deprecated in Python 3.3 and removed in Python 3.8, so it raises
# AttributeError on current interpreters. perf_counter() measures elapsed
# wall-clock time, which is what these phase timings are meant to report.
total_CUDA_time_Begin = time.perf_counter()

# --- Host -> device transfer and result allocation ---------------------
time_memory_alloc_GPU_Begin = time.perf_counter()
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
# Uninitialised device buffer that will receive C = A * B.
c_gpu = gpuarray.empty((nb_linesA, nb_columnsB), np.float32)
time_memory_alloc_GPU_End = time.perf_counter()

# --- Launch configuration ----------------------------------------------
# Square thread blocks of warp_size x warp_size threads.
threadPerBlockx = warp_size
threadPerBlocky = warp_size
# C has nb_columnsB columns (x axis) and nb_linesA rows (y axis).
size_Cx = nb_columnsB
size_Cy = nb_linesA
# Ceiling division so the grid covers every element of C even when its
# dimensions are not multiples of the block size; the kernel's bounds check
# discards the surplus threads. The y extent is divided by the y block size
# (the original divided it by threadPerBlockx, which only worked because
# both sizes happen to be equal).
BlockPerGridx = int(1 + (size_Cx - 1) // threadPerBlockx)
BlockPerGridy = int(1 + (size_Cy - 1) // threadPerBlocky)

# --- Kernel execution --------------------------------------------------
time_computation_CUDA_Begin = time.perf_counter()
MatProd(
    # output
    c_gpu,
    # inputs
    a_gpu, b_gpu,
    # dimAx, dimBx, dimCx, dimCy
    np.int32(nb_columnsA), np.int32(nb_columnsB), np.int32(size_Cx), np.int32(size_Cy),
    # a 2-D grid of 2-D thread blocks tiling C
    block=(threadPerBlockx, threadPerBlocky, 1),
    grid=(BlockPerGridx, BlockPerGridy),
)
# Wait for the GPU to finish before reading the clock, so the measured
# interval covers the kernel's execution and not just its submission.
driver.Context.synchronize()
time_computation_CUDA_End = time.perf_counter()

# --- Device -> host transfer -------------------------------------------
time_memory_get_result_GPU_Begin = time.perf_counter()
c_gpu_result = c_gpu.get()  # copy C back into a NumPy array
time_memory_get_result_GPU_End = time.perf_counter()

total_CUDA_time_End = time.perf_counter()
# --------------------------------------------------------------------
# --------------------END OF CUDA PART--------------------------------
# --------------------------------------------------------------------
# --------------------------------------------------------------------
# --------------------PYTHON PART-------------------------------------
# --------------------------------------------------------------------
# Timed CPU reference: the same product C = A * B computed with NumPy.
#
# Timestamps use time.perf_counter() because time.clock() was removed in
# Python 3.8.
total_Python_time_Begin = time.perf_counter()
# Allocate the result directly as float32. The original allocated float64,
# converted it, and then threw the buffer away when np.dot rebound c_cpu,
# so the "alloc C" portion of the total time measured a dead allocation.
c_cpu = np.empty((nb_linesA, nb_columnsB), dtype=np.float32)
time_computation_Python_Begin = time.perf_counter()
# Write the product into the pre-allocated buffer so the allocation above
# is the one actually used for the result.
np.dot(a_cpu, b_cpu, out=c_cpu)
time_computation_Python_End = time.perf_counter()
total_Python_time_End = time.perf_counter()
# --------------------------------------------------------------------
# --------------------END OF PYTHON PART------------------------------
# --------------------------------------------------------------------
#------------------------------------------------------------
# Report the measured durations.
# Every duration is derived first from its (Begin, End) timestamp pair,
# then the figures are printed in three groups separated by a blank-ish line.

# Pure computation (kernel launch + synchronize vs. np.dot).
time_computation_CUDA = time_computation_CUDA_End - time_computation_CUDA_Begin
time_computation_Python = time_computation_Python_End - time_computation_Python_Begin
# GPU memory traffic: upload/allocation, then download of the result.
time_memory_alloc_GPU = time_memory_alloc_GPU_End - time_memory_alloc_GPU_Begin
time_memory_get_result_GPU = (
    time_memory_get_result_GPU_End - time_memory_get_result_GPU_Begin
)
# End-to-end totals (computation + memory handling).
total_CUDA_time = total_CUDA_time_End - total_CUDA_time_Begin
total_Python_time = total_Python_time_End - total_Python_time_Begin

print("CUDA pure computation time : ", time_computation_CUDA)
print("Python pure computation time : ", time_computation_Python)
print(" ")
print(
    "CUDA memory allocation time (allocating C, transferring A,B from CPU to GPU):",
    time_memory_alloc_GPU,
)
print(
    "CUDA getting result from GPU (Pulling back C from GPU to CPU after computation) :",
    time_memory_get_result_GPU,
)
print(" ")
print(
    "CUDA total time (alloc C + A to gpu + B to gpu + comput + get result) :",
    total_CUDA_time,
)
print("Python total time (comput + alloc C) :", total_Python_time)
!nvcc --version
%load_ext version_information
import torch, pymetis, skcuda
%version_information torch, pycuda, pymetis, scikit-cuda
スポンサーリンク
スポンサーリンク