PyCUDAプログラミング学習にはCUDA Cプログラミング学習が必須のようだ。GPUプログラミングは敷居が高そうだが、面白そうではある。

昨日のコードのおさらい¶

昨日のコードは今見てもHello world!(超初歩)の割にかなり複雑なように思える。

from pycuda.tools import make_default_context

c = make_default_context()
d = c.get_device()
d.name()

'GeForce GTX 1060 with Max-Q Design'

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy

a = numpy.random.randn(4,4) # 4x4の乱数アレイを作る
a = a.astype(numpy.float32) # 倍精度数を単精度数に変換
a_gpu = cuda.mem_alloc(a.nbytes) # 転送先GPUにメモリを割り当てる
cuda.memcpy_htod(a_gpu, a) # データをGPUメモリに転送する

a.dtype # aのデータタイプを確認する。

dtype('float32')

a_gpu

<pycuda._driver.DeviceAllocation at 0x7f25c08be940>

# 入力データを２倍にするカーネル関数をCUDA Cで作成
mod = SourceModule("""
  __global__ void doublify(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
  }
  """)

func = mod.get_function("doublify")
func(a_gpu, block=(4,4,1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print (a) # 単精度(float32)乱数の入力データ
print (a_doubled) # input dataを２倍にした値

[[-0.25245064  1.4856048   0.8485535   1.0885941 ]
 [-1.0230184  -0.08208965 -0.3889643   1.1710126 ]
 [ 1.2860265  -2.2667894  -0.10706756  1.2155651 ]
 [-0.26334605  1.0229876  -1.1091733   0.617082  ]]
[[-0.5049013   2.9712095   1.697107    2.1771882 ]
 [-2.0460367  -0.1641793  -0.7779286   2.3420253 ]
 [ 2.572053   -4.533579   -0.21413513  2.4311302 ]
 [-0.5266921   2.0459752  -2.2183466   1.234164  ]]

より簡潔なPyCUDAコーディング¶

pycuda.gpuarray.GPUArrayを使うことで簡潔なコードに書き換え可能。

import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy

a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
a_doubled = (2*a_gpu).get()
print (a_doubled)
print (a_gpu)

[[ 4.459692    1.1557872   2.969427   -3.742111  ]
 [-1.7813717  -2.468989   -2.1829033  -3.6996453 ]
 [ 2.6310894  -1.3522909   1.6362696   0.35893446]
 [-0.81718874 -1.3393584   0.5271593  -0.01239202]]
[[ 2.229846    0.5778936   1.4847136  -1.8710555 ]
 [-0.89068586 -1.2344944  -1.0914516  -1.8498226 ]
 [ 1.3155447  -0.67614543  0.8181348   0.17946723]
 [-0.40859437 -0.6696792   0.26357964 -0.00619601]]

こんなに簡潔なコードに書き換えられることにびっくりした。こっちの方がHello world!プログラムにふさわしいように思う。

Advanced Topics(アドバンスドトピックス)¶

Structures (ストラクチャーズ)¶

mod = SourceModule("""
    struct DoubleOperation {
        int datalen, __padding; // so 64-bit ptrs can be aligned
        float *ptr;
    };

    __global__ void double_array(DoubleOperation *a) {
        a = &a[blockIdx.x];
        for (int idx = threadIdx.x; idx < a->datalen; idx += blockDim.x) {
            a->ptr[idx] *= 2;
        }
    }
    """)

class DoubleOpStruct:
    mem_size = 8 + numpy.intp(0).nbytes
    def __init__(self, array, struct_arr_ptr):
        self.data = cuda.to_device(array)
        self.shape, self.dtype = array.shape, array.dtype
        cuda.memcpy_htod(int(struct_arr_ptr), numpy.getbuffer(numpy.int32(array.size)))
        cuda.memcpy_htod(int(struct_arr_ptr) + 8, numpy.getbuffer(numpy.intp(int(self.data))))
    def __str__(self):
        return str(cuda.from_device(self.data, self.shape, self.dtype))

struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size)
do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size

array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32), struct_arr)
array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr)
print("original arrays", array1, array2)


AttributeErrorTraceback (most recent call last)
<ipython-input-11-904cbd6f145a> in <module>()
     12 do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size
     13 
---> 14 array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32), struct_arr)
     15 array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr)
     16 print("original arrays", array1, array2)

<ipython-input-11-904cbd6f145a> in __init__(self, array, struct_arr_ptr)
      4         self.data = cuda.to_device(array)
      5         self.shape, self.dtype = array.shape, array.dtype
----> 6         cuda.memcpy_htod(int(struct_arr_ptr), numpy.getbuffer(numpy.int32(array.size)))
      7         cuda.memcpy_htod(int(struct_arr_ptr) + 8, numpy.getbuffer(numpy.intp(int(self.data))))
      8     def __str__(self):

AttributeError: module 'numpy' has no attribute 'getbuffer'

numpy.getbufferはpython3でmemoryviewに取って代わられたらしい。

class DoubleOpStruct:
    mem_size = 8 + numpy.intp(0).nbytes
    def __init__(self, array, struct_arr_ptr):
        self.data = cuda.to_device(array)
        self.shape, self.dtype = array.shape, array.dtype
        cuda.memcpy_htod(int(struct_arr_ptr), memoryview(numpy.int32(array.size)))
        cuda.memcpy_htod(int(struct_arr_ptr) + 8, memoryview(numpy.intp(int(self.data))))
    def __str__(self):
        return str(cuda.from_device(self.data, self.shape, self.dtype))

struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size)
do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size

array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32), struct_arr)
array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr)
print("original arrays", array1, array2)

original arrays [1. 2. 3.] [0. 4.]

func = mod.get_function("double_array")
func(struct_arr, block = (32, 1, 1), grid=(2, 1))
print("doubled arrays", array1, array2)

func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1))
print("doubled second only", array1, array2, "\n")

doubled arrays [2. 4. 6.] [0. 8.]
doubled second only [2. 4. 6.] [ 0. 16.]

チュートリアルはここで終わってしまっている。後は、pycuda githubのサンプルコードやテストコードを見て学習するように推奨している。

参考サイトhttps://documen.tician.de

昨日のコードのおさらい¶

より簡潔なPyCUDAコーディング¶

Advanced Topics(アドバンスド トピックス)¶

Structures (ストラクチャーズ)¶

Advanced Topics(アドバンスドトピックス)¶