PyCUDA, Numba等のラプラシアンフィルタ処理速度比較

今回はこのサイトのコードを拝借して、numpy, cython, pythran, numba, numba guvectorize, pycudaのlaplacian filter(ラプラシアンフィルタ)処理速度比較をしてみる。

スポンサーリンク

Introducing the discrete Laplacian

先ずは必要なPythonモジュールをインポートする。

import skimage.data
import skimage.color
from skimage.filters import laplace
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

画像データをダウンロードする。

image = skimage.data.astronaut()
image = skimage.color.rgb2gray(image)
image.shape
(512, 512)
image.dtype
dtype('float64')

scikit-image implementation

def laplace_skimage(image):
    """Applies Laplace operator to 2D image using skimage implementation. 
    Then tresholds the result and returns boolean image."""
    laplacian = laplace(image)
    thresh = np.abs(laplacian) > 0.05
    return thresh
edges = laplace_skimage(image)
edges.shape
(512, 512)
%timeit laplace_skimage(image)
2.58 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
def compare(left, right):
    """Compares two images, left and right."""
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    ax[0].imshow(left, cmap='gray')
    ax[1].imshow(right, cmap='gray')
compare(left=image, right=edges)

NumPy implementation

def laplace_numpy(image):
    """Applies Laplace operator to 2D image using our own NumPy implementation. 
    Then tresholds the result and returns boolean image."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh
laplace_numpy(image).shape
(510, 510)
%timeit laplace_numpy(image)
502 µs ± 4.24 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
compare(image, laplace_numpy(image))
np.allclose(laplace_skimage(image)[1:-1, 1:-1], laplace_numpy(image))
True

Cython

%load_ext cython
%%cython
import numpy as np
   
def laplace_cython(image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh
%timeit laplace_cython(image)
1.26 ms ± 7.62 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
np.allclose(laplace_numpy(image), laplace_cython(image))
True
%%cython
import numpy as np
cimport numpy as cnp

def laplace_cython(cnp.ndarray image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh
%timeit laplace_cython(image)
1.26 ms ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
np.allclose(laplace_numpy(image), laplace_cython(image))
True
%%cython
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def laplace_cython(cnp.ndarray[double, ndim=2] image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    cdef int h = image.shape[0]
    cdef int w = image.shape[1]
    cdef cnp.ndarray[double, ndim=2] laplacian = np.empty((w-2, h-2), dtype=np.double)
    cdef int i, j
    for i in range(1, h-1):
        for j in range(1, w-1):
            laplacian[i-1, j-1] = image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]
    thresh = np.abs(laplacian) > 0.05
    return thresh
%timeit laplace_cython(image)
493 µs ± 1.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
np.allclose(laplace_numpy(image), laplace_cython(image))
True

Pythran

%load_ext pythran.magic
%%pythran
#pythran export laplace_pythran_highlevel(float[][])
import numpy as np
def laplace_pythran_highlevel(image):
    """Laplace operator in NumPy for 2D images. Pythran accelerated."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh
np.allclose(laplace_numpy(image), laplace_pythran_highlevel(image))
True
%timeit laplace_pythran_highlevel(image)
326 µs ± 2.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Numba

from numba import jit
@jit
def laplace_numba(image):
    """Laplace operator in NumPy for 2D images. Accelerated using numba."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh
laplace_numba(image);
%timeit laplace_numba(image)
1.3 ms ± 6.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
@jit(nopython=True, fastmath = True, parallel = True, nogil = True)
def laplace_numba(image):
    """Laplace operator for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    laplacian = np.empty((h - 2, w - 2))
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]) > 0.05
    return laplacian
laplace_numba(image);
%timeit laplace_numba(image)
150 µs ± 1.64 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
from numba import njit
@njit(fastmath = True, parallel = True, nogil = True)
def laplace_numba(image):
    """Laplace operator for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    laplacian = np.empty((h - 2, w - 2))
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]) > 0.05
    return laplacian
laplace_numba(image);
%timeit laplace_numba(image)
154 µs ± 3.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
np.allclose(laplace_numpy(image), laplace_numba(image))
True

numba guvectorize

from numba import guvectorize
@guvectorize('void(float64[:, :], float64[:, :])', "(m, n)->(m, n)", \
           nopython=True, fastmath = True)
def laplace_numba_guvectorize(image, laplacian):
    """Laplace operator in NumPy for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(4 * image[i, j] - image[i - 1, j] - \
               image[i + 1, j] - image[i, j + 1] - image[i, j - 1]) > 0.05
laplacian = np.empty_like(image)
laplace_numba_guvectorize(image, laplacian);
np.allclose(laplace_numpy(image), laplacian[:-2, :-2])
True
%timeit laplace_numba_guvectorize(image, laplacian);
368 µs ± 4.56 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
from numba import guvectorize
import math
@guvectorize('void(float64[:, :], float64[:, :])', "(m, n)->(m, n)", \
             target='parallel',nopython=True, fastmath = True)
def laplace_numba_guvectorize(image, laplacian):
    """Laplace operator in NumPy for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = abs(4 * image[i, j] - image[i - 1, j] - \
               image[i + 1, j] - image[i, j + 1] - image[i, j - 1]) > 0.05
laplacian = np.empty_like(image)
laplace_numba_guvectorize(image, laplacian);
np.allclose(laplace_numpy(image), laplacian[:-2, :-2])
True
%timeit laplace_numba_guvectorize(image, laplacian);
401 µs ± 3.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

PyCUDA

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
import matplotlib.pyplot as p

image = image.astype(np.float32)
(h,w)=image.shape
print (h,w)

mod_copy_texture=SourceModule(
"""
#include <cmath>
texture<float,2>tex;
__global__ void  copy_texture_kernel(float *C,float * data)
 {
  int i = threadIdx.x+(blockIdx.x*(blockDim.x));
  int j = threadIdx.y+(blockIdx.y*(blockDim.y));
  int h=C[0];
  int w=C[1];
  while(i<w)
  {
  while(j<h)
  {
  data[i+w*j] = abs(4*tex2D(tex,j,i)-tex2D(tex,j-1,i)-tex2D(tex,j+1,i)\
  -tex2D(tex,j,i+1)-tex2D(tex,j,i-1)) > 0.05;
  __syncthreads();
  j += blockDim.y * gridDim.y;
  }
  i += blockDim.x * gridDim.x;
  }
}
""")
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(image , texref , order = "F")
gpu_output = np.empty_like(image)
copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
          block=(32,32,1), grid=(h//32,w//32,1), texrefs=[texref])
512 512
%timeit copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
                  block=(32,16, 1), grid=(h//32,w//32,1), texrefs=[texref])
650 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
compare(gpu_output, laplace_numpy(image))

Wrap-up and plots

timings = {}
for func in [laplace_skimage, laplace_numpy, laplace_cython, laplace_pythran_highlevel, laplace_numba]:
    t = %timeit -o func(image)
    timings[func.__name__] = t
2.33 ms ± 71.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.13 ms ± 21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
514 µs ± 950 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
327 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
154 µs ± 2.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
t = %timeit -o laplace_numba_guvectorize(image, laplacian);
timings['laplace_numba_guvectorize'] = t
362 µs ± 4.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
t = %timeit -o copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
                  block=(32,16, 1), grid=(h//32,w//32,1), texrefs=[texref]);
timings['laplace_pycuda'] = t
636 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
import pandas as pd

pd.Series({key: timings[key].average * 1e6 for key in timings}).to_frame(name='timings (μs)').sort_values(by='timings (μs)')
timings (μs)
laplace_numba 153.610144
laplace_pythran_highlevel 326.623936
laplace_numba_guvectorize 361.773459
laplace_cython 514.308131
laplace_pycuda 635.745086
laplace_numpy 1130.314359
laplace_skimage 2326.259981
fig, ax = plt.subplots(figsize=(20, 12))
plt.rcParams["font.size"] = "30"
pd.Series({key: timings[key].average * 1e6 for key in timings}).to_frame(name='timings (μs)').sort_values(by='timings (μs)').plot(kind='barh', ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7fa79c53ca90>

pycudaの速度が思ったより芳しくなかったが、これはデータ量が少ないことに起因している。画像サイズがでかいとpycudaが一番速くなる。

参考サイトhttps://github.com/