今回はこのサイトのコードを拝借して、numpy, cython, pythran, numba, numba guvectorize, pycudaのlaplacian filter(ラプラシアンフィルタ)処理速度比較をしてみる。

Introducing the discrete Laplacian¶

先ずは必要なPythonモジュールをインポートする。

import skimage.data
import skimage.color
from skimage.filters import laplace
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

画像データをダウンロードする。

image = skimage.data.astronaut()
image = skimage.color.rgb2gray(image)
image.shape

(512, 512)

image.dtype

dtype('float64')

scikit-image implementation¶

def laplace_skimage(image):
    """Applies Laplace operator to 2D image using skimage implementation. 
    Then tresholds the result and returns boolean image."""
    laplacian = laplace(image)
    thresh = np.abs(laplacian) > 0.05
    return thresh

edges = laplace_skimage(image)
edges.shape

(512, 512)

%timeit laplace_skimage(image)

2.58 ms ± 113 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

def compare(left, right):
    """Compares two images, left and right."""
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    ax[0].imshow(left, cmap='gray')
    ax[1].imshow(right, cmap='gray')

compare(left=image, right=edges)

NumPy implementation¶

def laplace_numpy(image):
    """Applies Laplace operator to 2D image using our own NumPy implementation. 
    Then tresholds the result and returns boolean image."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh

laplace_numpy(image).shape

(510, 510)

%timeit laplace_numpy(image)

502 µs ± 4.24 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

compare(image, laplace_numpy(image))

np.allclose(laplace_skimage(image)[1:-1, 1:-1], laplace_numpy(image))

True

Cython¶

%load_ext cython

%%cython
import numpy as np
   
def laplace_cython(image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh

%timeit laplace_cython(image)

1.26 ms ± 7.62 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

np.allclose(laplace_numpy(image), laplace_cython(image))

True

%%cython
import numpy as np
cimport numpy as cnp

def laplace_cython(cnp.ndarray image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh

%timeit laplace_cython(image)

1.26 ms ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

np.allclose(laplace_numpy(image), laplace_cython(image))

True

%%cython
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def laplace_cython(cnp.ndarray[double, ndim=2] image):
    """Applies Laplace operator to 2D image, then tresholds the result and returns boolean image.
    Cython implementation."""
    cdef int h = image.shape[0]
    cdef int w = image.shape[1]
    cdef cnp.ndarray[double, ndim=2] laplacian = np.empty((w-2, h-2), dtype=np.double)
    cdef int i, j
    for i in range(1, h-1):
        for j in range(1, w-1):
            laplacian[i-1, j-1] = image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]
    thresh = np.abs(laplacian) > 0.05
    return thresh

%timeit laplace_cython(image)

493 µs ± 1.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

np.allclose(laplace_numpy(image), laplace_cython(image))

True

Pythran¶

%load_ext pythran.magic

%%pythran
#pythran export laplace_pythran_highlevel(float[][])
import numpy as np
def laplace_pythran_highlevel(image):
    """Laplace operator in NumPy for 2D images. Pythran accelerated."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image¹ + image² - 4*image³
    thresh = np.abs(laplacian) > 0.05
    return thresh

np.allclose(laplace_numpy(image), laplace_pythran_highlevel(image))

True

%timeit laplace_pythran_highlevel(image)

326 µs ± 2.72 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

Numba¶

from numba import jit

@jit
def laplace_numba(image):
    """Laplace operator in NumPy for 2D images. Accelerated using numba."""
    laplacian = image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:] - 4*image[1:-1, 1:-1]
    thresh = np.abs(laplacian) > 0.05
    return thresh

laplace_numba(image);

%timeit laplace_numba(image)

1.3 ms ± 6.33 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

@jit(nopython=True, fastmath = True, parallel = True, nogil = True)
def laplace_numba(image):
    """Laplace operator for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    laplacian = np.empty((h - 2, w - 2))
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]) > 0.05
    return laplacian

laplace_numba(image);

%timeit laplace_numba(image)

150 µs ± 1.64 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

from numba import njit
@njit(fastmath = True, parallel = True, nogil = True)
def laplace_numba(image):
    """Laplace operator for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    laplacian = np.empty((h - 2, w - 2))
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(image[i-1, j] + image[i+1, j] + image[i, j-1] + image[i, j+1] - 4*image[i, j]) > 0.05
    return laplacian

laplace_numba(image);

%timeit laplace_numba(image)

154 µs ± 3.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

np.allclose(laplace_numpy(image), laplace_numba(image))

True

numba guvectorize¶

from numba import guvectorize

@guvectorize('void(float64[:, :], float64[:, :])', "(m, n)->(m, n)", \
           nopython=True, fastmath = True)
def laplace_numba_guvectorize(image, laplacian):
    """Laplace operator in NumPy for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = np.abs(4 * image[i, j] - image[i - 1, j] - \
               image[i + 1, j] - image[i, j + 1] - image[i, j - 1]) > 0.05

laplacian = np.empty_like(image)

laplace_numba_guvectorize(image, laplacian);

np.allclose(laplace_numpy(image), laplacian[:-2, :-2])

True

%timeit laplace_numba_guvectorize(image, laplacian);

368 µs ± 4.56 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

from numba import guvectorize
import math
@guvectorize('void(float64[:, :], float64[:, :])', "(m, n)->(m, n)", \
             target='parallel',nopython=True, fastmath = True)
def laplace_numba_guvectorize(image, laplacian):
    """Laplace operator in NumPy for 2D images. Numba accelerated."""
    h = image.shape[0]
    w = image.shape[1]
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i-1, j-1] = abs(4 * image[i, j] - image[i - 1, j] - \
               image[i + 1, j] - image[i, j + 1] - image[i, j - 1]) > 0.05

laplacian = np.empty_like(image)

laplace_numba_guvectorize(image, laplacian);

np.allclose(laplace_numpy(image), laplacian[:-2, :-2])

True

%timeit laplace_numba_guvectorize(image, laplacian);

401 µs ± 3.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

PyCUDA¶

import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule
import matplotlib.pyplot as p

image = image.astype(np.float32)
(h,w)=image.shape
print (h,w)

mod_copy_texture=SourceModule(
"""
#include <cmath>
texture<float,2>tex;
__global__ void  copy_texture_kernel(float *C,float * data)
 {
  int i = threadIdx.x+(blockIdx.x*(blockDim.x));
  int j = threadIdx.y+(blockIdx.y*(blockDim.y));
  int h=C[0];
  int w=C⁴;
  while(i<w)
  {
  while(j<h)
  {
  data[i+w*j] = abs(4*tex2D(tex,j,i)-tex2D(tex,j-1,i)-tex2D(tex,j+1,i)\
  -tex2D(tex,j,i+1)-tex2D(tex,j,i-1)) > 0.05;
  __syncthreads();
  j += blockDim.y * gridDim.y;
  }
  i += blockDim.x * gridDim.x;
  }
}
""")
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(image , texref , order = "F")
gpu_output = np.empty_like(image)
copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
          block=(32,32,1), grid=(h//32,w//32,1), texrefs=[texref])

512 512

%timeit copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
                  block=(32,16, 1), grid=(h//32,w//32,1), texrefs=[texref])

650 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

compare(gpu_output, laplace_numpy(image))

Wrap-up and plots¶

timings = {}
for func in [laplace_skimage, laplace_numpy, laplace_cython, laplace_pythran_highlevel, laplace_numba]:
    t = %timeit -o func(image)
    timings[func.__name__] = t

2.33 ms ± 71.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.13 ms ± 21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
514 µs ± 950 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
327 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
154 µs ± 2.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

t = %timeit -o laplace_numba_guvectorize(image, laplacian);
timings['laplace_numba_guvectorize'] = t

362 µs ± 4.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

t = %timeit -o copy_texture_func(cuda.In(np.float32([h,w])),cuda.Out(gpu_output),\
                  block=(32,16, 1), grid=(h//32,w//32,1), texrefs=[texref]);
timings['laplace_pycuda'] = t

636 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

import pandas as pd

pd.Series({key: timings[key].average * 1e6 for key in timings}).to_frame(name='timings (μs)').sort_values(by='timings (μs)')

	timings (μs)
laplace_numba	153.610144
laplace_pythran_highlevel	326.623936
laplace_numba_guvectorize	361.773459
laplace_cython	514.308131
laplace_pycuda	635.745086
laplace_numpy	1130.314359
laplace_skimage	2326.259981

fig, ax = plt.subplots(figsize=(20, 12))
plt.rcParams["font.size"] = "30"
pd.Series({key: timings[key].average * 1e6 for key in timings}).to_frame(name='timings (μs)').sort_values(by='timings (μs)').plot(kind='barh', ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7fa79c53ca90>

pycudaの速度が思ったより芳しくなかったが、これはデータ量が少ないことに起因している。画像サイズがでかいとpycudaが一番速くなる。