PyCUDAのtemplate metaprogrammingバージョンのコードを、ElementwiseKernelバージョンのコードに書き換える。ElementwiseKernelは非常に便利なkernelだと思う。

Templateバージョン¶

このサイトからコードを引っ張ってくる。

#!/usr/bin/env python

"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from __future__ import print_function

from string import Template
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np

import skcuda.misc as misc

A = 3
B = 4
N = A * B

# Define a 2D array:
# x_orig = np.arange(0, N, 1, np.float64)
x_orig = np.asarray(np.random.rand(N), np.float64)
x = x_orig.reshape((A, B))

# These functions demonstrate how to convert a linear index into subscripts:
a = lambda i: i // B
b = lambda i: np.mod(i, B)

# Check that x[subscript(i)] is equivalent to x.flat[i]:
subscript = lambda i: (a(i), b(i))
for i in range(x.size):
    assert x.flat[i] == x[subscript(i)]

# Check that x[i, j] is equivalent to x.flat[index(i, j)]:
index = lambda i, j: i * B + j
for i in range(A):
    for j in range(B):
        assert x[i, j] == x.flat[index(i, j)]

func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b) a*${B}+b
__global__ void func(double *x, unsigned int N) {
    // Obtain the linear index corresponding to the current thread:
    unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                       blockIdx.x*${max_threads_per_block}+threadIdx.x;
    // Convert the linear index to subscripts:
    unsigned int a = idx/${B};
    unsigned int b = idx%${B};
    // Use the subscripts to access the array:
    if (idx < N) {
        if (b == 0)
           x[INDEX(a,b)] = 100;
    }
}
""")

max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device)
block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape)
max_blocks_per_grid = max(max_grid_dim)

func_mod = \
    SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
                                              max_blocks_per_grid=max_blocks_per_grid,
                                              A=A, B=B))
func = func_mod.get_function('func')
x_gpu = gpuarray.to_gpu(x)
func(x_gpu.gpudata, np.uint32(x_gpu.size),
     block=block_dim,
     grid=grid_dim)
x_np = x.copy()
x_np[:, 0] = 100

print('Success status: %r' % np.allclose(x_np, x_gpu.get()))

Success status: True

ElementwiseKernelバージョン¶

上のコードをこのサイトを参考にしてElementwiseKernelを使って書き換える。

#!/usr/bin/env python

"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from __future__ import print_function
from pycuda.elementwise import ElementwiseKernel
from string import Template
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.misc as misc
import matplotlib.pyplot as plt

A = 2**3
B = 2**3
N = A * B

# Define a 2D array:
# x_orig = np.arange(0, N, 1, np.float64)
x_orig = np.asarray(np.random.rand(N), np.float64)
x = x_orig.reshape((A, B))

# These functions demonstrate how to convert a linear index into subscripts:
a = lambda i: i // B
b = lambda i: np.mod(i, B)

# Check that x[subscript(i)] is equivalent to x.flat[i]:
subscript = lambda i: (a(i), b(i))
for i in range(x.size):
    assert x.flat[i] == x[subscript(i)]

# Check that x[i, j] is equivalent to x.flat[index(i, j)]:
index = lambda i, j: i * B + j
for i in range(A):
    for j in range(B):
        assert x[i, j] == x.flat[index(i, j)]

func = ElementwiseKernel(
    "double *x, unsigned int N",
    
    Template("""
    #define INDEX(a, b) a*${B}+b
    // Convert the linear index to subscripts:
    unsigned int a = i/${B};
    unsigned int b = i%${B};
    // Use the subscripts to access the array:
    if (i < N && b == 0) {
           x[INDEX(a,b)] = 100;}""").substitute(B=B))

x_gpu = gpuarray.to_gpu(x)
func(x_gpu,  x_gpu.size)
x_np = x.copy()
x_np[:, 0] = 100

print('Success status: %r' % np.allclose(x_np, x_gpu.get()))

plt.imshow((x_gpu.size/x_gpu.get()).real)
plt.colorbar()
plt.show()

Success status: True

ElementwiseKernelを使うと非常にスッキリしたコードになる。