pycudaによる画像処理高速化(CPUと処理速度を比較)

pycudaを使った画像加工を検証してみる。このサイトから拝借した下記のコードは画像をCPUとGPUを別個に使ってグレー化し、画像処理にかかった時間を表示してくれる。

import PIL
from PIL import Image
import time
 
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
 
def blackWhite(inPath , outPath , mode = "luminosity",log = 0):
 
    if log == 1 :
        print ("----------> SERIAL CONVERSION")
    totalT0 = time.time()
 
    im = Image.open(inPath)
    px = numpy.array(im)
 
    getDataT1 = time.time()
 
    print ("-----> Opening path :" , inPath)
 
    processT0 =  time.time()
    for x in range(im.size[1]):
        for y in range(im.size[0]):
 
            r = px[x][y][0]
            g = px[x][y][1]
            b = px[x][y][2]
            if mode == "luminosity" :
                val =  int(0.21 *float(r)  + 0.71*float(g)  + 0.07 * float(b))
 
            else :
                val = int((r +g + b) /3)
 
            px[x][y][0] = val
            px[x][y][1] = val
            px[x][y][2] = val
 
    processT1= time.time()
    #px = numpy.array(im.getdata())
    im = Image.fromarray(px)
    im.save(outPath)
 
    print ("-----> Saving path :" , outPath)
    totalT1 = time.time()
 
    if log == 1 :
        print ("Image size : ",im.size)
        print ("get and convert Image data  : " ,getDataT1-totalT0 )
        print ("Processing data : " , processT1 - processT0 )
        print ("Save image time : " , totalT1-processT1)
        print ("total  Execution time : " ,totalT1-totalT0 )
 
def CudablackWhite(inPath , outPath , mode = "luminosity" , log = 0):
 
    if log == 1 :
        print ("----------> CUDA CONVERSION")
 
    totalT0 = time.time()
 
    im = Image.open(inPath)
    px = numpy.array(im)
    px = px.astype(numpy.float32)
 
    getAndConvertT1 = time.time()
 
    allocT0 = time.time()
    d_px = cuda.mem_alloc(px.nbytes)
    cuda.memcpy_htod(d_px, px)
 
    allocT1 = time.time()
 
    #Kernel declaration
    kernelT0 = time.time()
 
    #Kernel grid and block size
    BLOCK_SIZE = 1024
    block = (1024,1,1)
    checkSize = numpy.int32(im.size[0]*im.size[1])
    grid = (int(im.size[0]*im.size[1]/BLOCK_SIZE)+1,1,1)
 
    #Kernel text
    kernel = """
 
    __global__ void bw( float *inIm, int check ){
 
        int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ;
 
        if(idx *3 < check*3)
        {
        int val = 0.21 *inIm[idx*3] + 0.71*inIm[idx*3+1] + 0.07 * inIm[idx*3+2];
 
        inIm[idx*3]= val;
        inIm[idx*3+1]= val;
        inIm[idx*3+2]= val;
        }
    }
    """     
    #Compile and get kernel function
    mod = SourceModule(kernel)
    func = mod.get_function("bw")
    func(d_px,checkSize, block=block,grid = grid)
 
    kernelT1 = time.time()
 
    #Get back data from gpu
    backDataT0 = time.time()
 
    bwPx = numpy.empty_like(px)
    cuda.memcpy_dtoh(bwPx, d_px)
    bwPx = (numpy.uint8(bwPx))
 
    backDataT1 = time.time()
 
    #Save image
    storeImageT0 = time.time()
    pil_im = Image.fromarray(bwPx,mode ="RGB")
 
    pil_im.save(outPath)
    print ("-----> Saving path :" , outPath)
 
    totalT1 = time.time()
 
    getAndConvertTime = getAndConvertT1 - totalT0
    allocTime = allocT1 - allocT0
    kernelTime = kernelT1 - kernelT0
    backDataTime = backDataT1 - backDataT0
    storeImageTime =totalT1 - storeImageT0
    totalTime = totalT1-totalT0
 
    if log == 1 :
        print ("Image size : ",im.size)
        print ("get and convert Image data to gpu ready : " ,getAndConvertTime )
        print ("allocate mem to gpu: " , allocTime )
        print ("Kernel execution time : " , kernelTime)
        print ("Get data from gpu and convert : " , backDataTime)
        print ("Save image time : " , storeImageTime)
        print ("total  Execution time : " ,totalTime )
スポンサーリンク

小さい画像サイズでテスト

使用する画像を確認する。

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 15

img=mpimg.imread('duck2.jpg')
imgplot = plt.imshow(img)
plt.show()

先ずはCPUによるテスト

inPath = "duck2.jpg"
outPath = "duck4.jpg"
blackWhite(inPath , outPath , mode = "luminosity",log = 1)
----------> SERIAL CONVERSION
-----> Opening path : duck2.jpg
-----> Saving path : duck4.jpg
Image size :  (640, 480)
get and convert Image data  :  0.005021333694458008
Processing data :  0.7377445697784424
Save image time :  0.008235692977905273
total  Execution time :  0.7511696815490723

続いてGPUによるテスト

inPath = "duck2.jpg"
outPath = "duck5.jpg"
CudablackWhite(inPath , outPath , mode = "luminosity" , log = 1)
----------> CUDA CONVERSION
-----> Saving path : duck5.jpg
Image size :  (640, 480)
get and convert Image data to gpu ready :  0.005597591400146484
allocate mem to gpu:  0.0006895065307617188
Kernel execution time :  0.11334371566772461
Get data from gpu and convert :  0.0021092891693115234
Save image time :  0.008312225341796875
total  Execution time :  0.13005352020263672
(0.7511696815490723-0.008235692977905273)/(0.13005352020263672-0.008312225341796875)
6.1025635501938815

処理速度自体は6.1倍高速という結果だった。完成画像を見比べて見る。

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
%matplotlib inline
# figure size in inches optional
rcParams['figure.figsize'] = 20, 30

# read images
img_A = mpimg.imread('duck4.jpg')
img_B = mpimg.imread('duck5.jpg')

# display images
fig, ax = plt.subplots(1,2)
ax[0].imshow(img_A);
ax[1].imshow(img_B);

大きい画像サイズでテスト

テストデータを以下のサイトからダウンロードする。

%download http://imgsrc.hubblesite.org/hvi/uploads/image_file/image_attachment/30797/STSCI-H-p1827f-f-3799x4123.png -f big.png
Downloaded 'big.png'.

ダウンロードした画像を確認。サイズが大きいので小さくしてから表示する。

from PIL import Image
from resizeimage import resizeimage

with open('big.png', 'r+b') as f:
    with Image.open(f) as image:
        cover = resizeimage.resize_cover(image, [100, 100])
        cover.save('new.png', image.format)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
rcParams['figure.figsize'] = 8, 10

img=mpimg.imread('new.png')
imgplot = plt.imshow(img)
plt.show()
inPath = "big.png"
outPath = "big1.png"
blackWhite(inPath , outPath , mode = "luminosity",log = 1)
----------> SERIAL CONVERSION
-----> Opening path : big.png
-----> Saving path : big1.png
Image size :  (3799, 4123)
get and convert Image data  :  0.3371775150299072
Processing data :  31.579903602600098
Save image time :  3.6182503700256348
total  Execution time :  35.535399198532104
inPath = "big.png"
outPath = "big2.png"
CudablackWhite(inPath , outPath , mode = "luminosity" , log = 1)
----------> CUDA CONVERSION
-----> Saving path : big2.png
Image size :  (3799, 4123)
get and convert Image data to gpu ready :  0.3870694637298584
allocate mem to gpu:  0.018987178802490234
Kernel execution time :  0.11678385734558105
Get data from gpu and convert :  0.09862279891967773
Save image time :  3.523271322250366
total  Execution time :  4.144736289978027
35.535399198532104/4.144736289978027,31.91714882850647/0.6214649677276611
(8.573621266196525, 51.357921179707155)

トータルでの実行速度は8.6倍という結果だったが、イメージファイルの保存にかかる時間を差し引いた純粋な処理速度はCPUの51倍という驚くべき数値を叩き出している。

元画像をグレー化した加工画像を確認

from PIL import Image
from resizeimage import resizeimage

with open('big1.png', 'r+b') as f:
    with Image.open(f) as image:
        cover = resizeimage.resize_cover(image, [100, 100])
        cover.save('new1.png', image.format)
with open('big2.png', 'r+b') as f:
    with Image.open(f) as image:
        cover = resizeimage.resize_cover(image, [100, 100])
        cover.save('new2.png', image.format)
# figure size in inches optional
rcParams['figure.figsize'] = 20, 30

# read images
img_A = mpimg.imread('new1.png')
img_B = mpimg.imread('new2.png')

# display images
fig, ax = plt.subplots(1,2)
ax[0].imshow(img_A);
ax[1].imshow(img_B);

画像処理にGPUを使うとかなりの時間短縮になることが分かった。

参考サイトhttps://stackoverflow.com/