PyCUDAのtemplate metaprogrammingバージョンのコードを、ElementwiseKernelバージョンのコードに書き換える。ElementwiseKernelは非常に便利なkernelだと思う。
スポンサーリンク
Templateバージョン¶
このサイトからコードを引っ張ってくる。
#!/usr/bin/env python
"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from __future__ import print_function
from string import Template
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import skcuda.misc as misc
A = 3
B = 4
N = A * B
# Define a 2D array:
# x_orig = np.arange(0, N, 1, np.float64)
x_orig = np.asarray(np.random.rand(N), np.float64)
x = x_orig.reshape((A, B))
# These functions demonstrate how to convert a linear index into subscripts:
a = lambda i: i // B
b = lambda i: np.mod(i, B)
# Check that x[subscript(i)] is equivalent to x.flat[i]:
subscript = lambda i: (a(i), b(i))
for i in range(x.size):
assert x.flat[i] == x[subscript(i)]
# Check that x[i, j] is equivalent to x.flat[index(i, j)]:
index = lambda i, j: i * B + j
for i in range(A):
for j in range(B):
assert x[i, j] == x.flat[index(i, j)]
func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b) a*${B}+b
__global__ void func(double *x, unsigned int N) {
// Obtain the linear index corresponding to the current thread:
unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
blockIdx.x*${max_threads_per_block}+threadIdx.x;
// Convert the linear index to subscripts:
unsigned int a = idx/${B};
unsigned int b = idx%${B};
// Use the subscripts to access the array:
if (idx < N) {
if (b == 0)
x[INDEX(a,b)] = 100;
}
}
""")
max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device)
block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape)
max_blocks_per_grid = max(max_grid_dim)
func_mod = \
SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
max_blocks_per_grid=max_blocks_per_grid,
A=A, B=B))
func = func_mod.get_function('func')
x_gpu = gpuarray.to_gpu(x)
func(x_gpu.gpudata, np.uint32(x_gpu.size),
block=block_dim,
grid=grid_dim)
x_np = x.copy()
x_np[:, 0] = 100
print('Success status: %r' % np.allclose(x_np, x_gpu.get()))
スポンサーリンク
ElementwiseKernelバージョン¶
上のコードをこのサイトを参考にしてElementwiseKernelを使って書き換える。
#!/usr/bin/env python
"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from __future__ import print_function
from pycuda.elementwise import ElementwiseKernel
from string import Template
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np
import skcuda.misc as misc
import matplotlib.pyplot as plt
A = 2**3
B = 2**3
N = A * B
# Define a 2D array:
# x_orig = np.arange(0, N, 1, np.float64)
x_orig = np.asarray(np.random.rand(N), np.float64)
x = x_orig.reshape((A, B))
# These functions demonstrate how to convert a linear index into subscripts:
a = lambda i: i // B
b = lambda i: np.mod(i, B)
# Check that x[subscript(i)] is equivalent to x.flat[i]:
subscript = lambda i: (a(i), b(i))
for i in range(x.size):
assert x.flat[i] == x[subscript(i)]
# Check that x[i, j] is equivalent to x.flat[index(i, j)]:
index = lambda i, j: i * B + j
for i in range(A):
for j in range(B):
assert x[i, j] == x.flat[index(i, j)]
func = ElementwiseKernel(
"double *x, unsigned int N",
Template("""
#define INDEX(a, b) a*${B}+b
// Convert the linear index to subscripts:
unsigned int a = i/${B};
unsigned int b = i%${B};
// Use the subscripts to access the array:
if (i < N && b == 0) {
x[INDEX(a,b)] = 100;}""").substitute(B=B))
x_gpu = gpuarray.to_gpu(x)
func(x_gpu, x_gpu.size)
x_np = x.copy()
x_np[:, 0] = 100
print('Success status: %r' % np.allclose(x_np, x_gpu.get()))
plt.imshow((x_gpu.size/x_gpu.get()).real)
plt.colorbar()
plt.show()
ElementwiseKernelを使うと非常にスッキリしたコードになる。
スポンサーリンク
スポンサーリンク