cuda10+cudnn7.3+pytorch1.0の実力を試してみる

今回は、PyTorch-1.0の実力を試してみる。前回このブログでpytorch-0.5がかなり速度アップしていることが分かったので、pytorch1.0はどのくらい速度アップしているのかを知りたくなったというわけだ。

スポンサーリンク

Simple Neural Network

import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd  # for display and clear_output
import time  # for sleep and time()

%matplotlib inline
# Notebook cell: configure matplotlib and build the small noisy 1-D regression
# data set that every implementation below is trained on.
plt.rcParams.update({'font.size': 18, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# Make some training data: n evenly spaced inputs on [-10, 10] as an (n, 1) column.
n = 20
Xtrain = np.linspace(0., 20.0, n).reshape((n,1)) - 10
# Targets = linear trend + sinusoid + Gaussian noise (std 0.2).
Ttrain = 0.2 + 0.05 * (Xtrain+10) + 0.4 * np.sin(Xtrain+10) + 0.2 * np.random.normal(size=(n,1))

# Make some testing data: the training inputs jittered by Gaussian noise (std 0.1),
# with targets drawn from the same generating function and fresh noise.
Xtest = Xtrain + 0.1*np.random.normal(size=(n,1))
Ttest = 0.2 + 0.05 * (Xtest+10) + 0.4 * np.sin(Xtest+10) + 0.2 * np.random.normal(size=(n,1))

# Cast everything to float32; torch.from_numpy on these arrays is later shown
# to yield 'torch.FloatTensor'.
Xtrain = Xtrain.astype(np.float32)
Ttrain = Ttrain.astype(np.float32)
Xtest = Xtest.astype(np.float32)
Ttest = Ttest.astype(np.float32)
# Figure defaults; note this overrides the font size of 18 set above.
plt.rcParams['figure.figsize'] = 14, 7
plt.rcParams["font.size"] = "17"
plt.plot(Xtrain, Ttrain, label='Training Data')
plt.plot(Xtest, Ttest, label='Testing Data')
plt.legend();

Numpy実装

def nn_numpy(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with hand-written backprop in NumPy.

    The network is ``Y = tanh(X @ V[1:] + V[0]) @ W[1:] + W[0]``; row 0 of each
    weight matrix holds the biases.  Every repetition is one full-batch
    gradient step on the squared error.

    Args:
        Xtrain: 2-D array of training inputs, one sample per row.
        Ttrain: 2-D array of training targets, one sample per row.
        Xtest: 2-D array of test inputs (used only for the error trace/plots).
        Ttest: 2-D array of test targets.
        nHiddens: number of tanh hidden units.
        rhoh: hidden-layer learning rate; divided by nSamples*nOutputs before use.
        rhoo: output-layer learning rate; divided by nSamples*nOutputs before use.
        nReps: number of full-batch update steps.
        graphics: when true, redraw three diagnostic plots every 1000 steps and
            on the last step.  The plots use Xtrain/Xtest directly as the x axis,
            so they are only meaningful for single-column inputs.

    Returns:
        ``(errorTrace, seconds)`` where ``errorTrace`` is an ``(nReps, 2)`` array
        (column 0: training RMSE from the forward pass before that step's update;
        column 1: test RMSE after the update) and ``seconds`` is the wall-clock
        time of initialization plus the loop, including any plotting.
    """

    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]
    nOutputs = Ttrain.shape[1]

    # Scale the learning rates by the number of error terms summed per step.
    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1), in the dtype of the data.
    # Row 0 is the bias row, hence the "1 +".  The first dimension follows the
    # number of input columns instead of assuming exactly one.
    V = 0.1*2*(np.random.uniform(size=(1+nInputs,nHiddens))-0.5).astype(Xtrain.dtype)
    W = 0.1*2*(np.random.uniform(size=(1+nHiddens,nOutputs))-0.5).astype(Xtrain.dtype)

    # collect training and testing errors for plotting
    errorTrace = np.zeros((nReps,2))

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data
        Z = np.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps.
        # vDelta must be computed before W changes; (1 - Z**2) is tanh'.
        vDelta = ( error @ W[1:,:].T) * (1-Z**2)
        V[1:, :] += rh * Xtrain.T @ vDelta
        V[0, :] += rh * np.sum(vDelta, 0)

        W[1:, :] += ro * Z.T @ error
        W[0, :] += ro * np.sum(error, 0)

        # error traces for plotting
        errorTrace[reps,0] = np.sqrt(np.mean((error**2)))
        Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  # forward pass in one line
        errorTrace[reps,1] = np.sqrt(np.mean((Ytest-Ttest)**2))

        if graphics and (reps % 1000 == 0 or reps == nReps-1):

            plt.clf()
            plt.subplot(3,1,1)
            plt.plot(errorTrace[:reps+1,:])
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')

            plt.subplot(3,1,2)
            plt.plot(Xtrain, Ttrain, 'o-',
                     Xtest, Ttest, 'o-',
                     Xtest, Ytest, 'o-')
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3,1,3)
            plt.plot(Xtrain, Z)
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            # Replace the previous frame in the notebook output area.
            ipd.clear_output(wait=True)
            ipd.display(fig)

    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

Pytorch

import torch

# Notebook cell: pick the (legacy) tensor type used for the CPU run and build
# the CPU tensors passed to nn_torch.
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
dtype
torch.FloatTensor
# from_numpy on the float32 arrays gives a FloatTensor (type echoed below).
Xtrain_torch = torch.from_numpy(Xtrain)
Xtrain_torch.type()
'torch.FloatTensor'
# Make some training data directly in torch (mirrors the NumPy generation).
# NOTE: these four tensors are overwritten a few lines below by the
# from_numpy versions, so the benchmark trains on the NumPy-generated data.
n = 20

# Xtrain = np.linspace(0.,20.0,n).reshape((n,1)) - 10
# Ttrain = 0.2 + 0.05 * (Xtrain+10) + 0.4 * np.sin(Xtrain+10) + 0.2 * np.random.normal(size=(n,1))
Xtrain_torch = torch.linspace(0.,20.0,n).view((n,1)).type(dtype) - 10
Ttrain_torch = 0.2 + 0.05 * (Xtrain_torch+10) + 0.4 * torch.sin(Xtrain_torch+10) + 0.2 * torch.randn((n,1)).type(dtype)

# Make some testing data
# Xtest = Xtrain + 0.1*np.random.normal(size=(n,1))
# Ttest = 0.2 + 0.05 * (Xtest+10) + 0.4 * np.sin(Xtest+10) + 0.2 * np.random.normal(size=(n,1))
Xtest_torch = Xtrain_torch + 0.1 * torch.randn((n,1)).type(dtype)
Ttest_torch = 0.2 + 0.05 * (Xtest_torch+10) + 0.4 * torch.sin(Xtest_torch+10) + 0.2 * torch.randn((n,1)).type(dtype)
# Replace the torch-generated data with tensors built from the NumPy arrays.
Xtrain_torch = torch.from_numpy(Xtrain)
Ttrain_torch = torch.from_numpy(Ttrain)

Xtest_torch = torch.from_numpy(Xtest)
Ttest_torch = torch.from_numpy(Ttest)
def torch_rms(error):
    """Return the root-mean-square of a 2-D tensor as a 1x1 tensor.

    The squared entries are averaged down the rows first and then across the
    resulting row of column means; both reductions keep their dimension, so
    the result has shape (1, 1) and stays on the same device as ``error``.
    """
    squared = error**2
    column_means = squared.mean(0, keepdim=True)
    overall_mean = column_means.mean(1, keepdim=True)
    return overall_mean.sqrt()
# Sanity check of torch_rms on the CPU: every entry of a-b is -1.2, so the
# RMS should come out as 1.2 (see the echoed output below).
dt = torch.FloatTensor
a = torch.ones(5,5).type(dt)
b = a+1.2
print(a)
print(b)
torch_rms(a-b)
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000]])
tensor([[1.2000]])
# Same sanity check with CUDA tensors; the echoed result stays on 'cuda:0'.
dt = torch.cuda.FloatTensor
a = torch.ones(5,5).type(dt)
b = a+1.2
print(a)
print(b)
torch_rms(a-b)
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]], device='cuda:0')
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000]], device='cuda:0')
tensor([[1.2000]], device='cuda:0')
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with hand-written backprop on torch tensors.

    PyTorch counterpart of the NumPy implementation.  The network is
    ``Y = tanh(X @ V[1:] + V[0]) @ W[1:] + W[0]`` (row 0 of each weight matrix
    holds the biases) and every repetition is one full-batch gradient step.
    All tensors created here use the tensor type reported by ``Xtrain.type()``,
    so CUDA inputs keep the whole loop on the GPU.

    Args:
        Xtrain: 2-D tensor of training inputs, one sample per row.
        Ttrain: 2-D tensor of training targets, one sample per row.
        Xtest: 2-D tensor of test inputs (used only for the error trace/plots).
        Ttest: 2-D tensor of test targets.
        nHiddens: number of tanh hidden units.
        rhoh: hidden-layer learning rate; divided by nSamples*nOutputs before use.
        rhoo: output-layer learning rate; divided by nSamples*nOutputs before use.
        nReps: number of full-batch update steps.
        graphics: when true, redraw three diagnostic plots every 1000 steps and
            on the last step.  The plots use Xtrain/Xtest directly as the x axis,
            so they are only meaningful for single-column inputs.

    Returns:
        ``(errorTrace, seconds)`` where ``errorTrace`` is an ``(nReps, 2)`` tensor
        of the same tensor type as ``Xtrain`` (column 0: training RMSE from the
        forward pass before that step's update; column 1: test RMSE after the
        update) and ``seconds`` is the wall-clock time of initialization plus
        the loop, including any plotting.
    """

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too

    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]
    nOutputs = Ttrain.shape[1]

    # Scale the learning rates by the number of error terms summed per step.
    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1).  Row 0 is the bias row,
    # hence the "1 +"; the first dimension of V follows the input width.
    V = 0.1*2*(torch.rand(1+nInputs,nHiddens)-0.5).type(dtype)
    W = 0.1*2*(torch.rand(1+nHiddens,nOutputs)-0.5).type(dtype)

    # collect training and testing errors for plotting
    errorTrace = torch.zeros((nReps,2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data.  torch.tanh (not np.tanh) so that
        # CUDA tensors are handled as well.
        Z = torch.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps.
        # vDelta must be computed before W changes; (1 - Z**2) is tanh'.
        vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
        V[1:, :] += rh * Xtrain.t() @ vDelta
        # Bias row only: the slice must be 0:1.  A slice of 0: would broadcast
        # the bias gradient onto every row of the weight matrix.
        V[0:1, :] += rh * torch.sum(vDelta, 0, keepdim=True)

        W[1:, :] += ro * Z.t() @ error
        W[0:1, :] += ro * torch.sum(error, 0, keepdim=True)

        # error traces for plotting (torch_rms returns a 1x1 tensor)
        errorTrace[reps,0:1] = torch_rms(error)
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  # forward pass in one line
        errorTrace[reps,1:2] = torch_rms(Ytest - Ttest)

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
            # .cpu() is a no-op for tensors already on the CPU, so one code
            # path serves both CPU and CUDA data.
            plt.clf()
            plt.subplot(3,1,1)
            plt.plot(errorTrace[:reps+1, :].cpu().numpy())
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')

            plt.subplot(3,1,2)
            plt.plot(Xtrain.cpu().numpy(), Ttrain.cpu().numpy(), 'o-',
                     Xtest.cpu().numpy(), Ttest.cpu().numpy(), 'o-',
                     Xtest.cpu().numpy(), Ytest.cpu().numpy(), 'o-')
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3,1,3)
            plt.plot(Xtrain.cpu().numpy(), Z.cpu().numpy())
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            # Replace the previous frame in the notebook output area.
            ipd.clear_output(wait=True)
            ipd.display(fig)

    # CUDA kernels are launched asynchronously; wait for the queued work to
    # finish so the elapsed time covers the computation, not just the launches.
    if Xtrain.is_cuda:
        torch.cuda.synchronize()
    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

Pytorch on GPU

# Notebook cell: confirm CUDA is usable and build the GPU tensors passed to
# nn_torch.  Bare expressions followed by a value are echoed notebook output.
torch.cuda.is_available()
True
dtype = torch.cuda.FloatTensor
dtype
torch.cuda.FloatTensor
dtype = torch.cuda.FloatTensor
n = 20

# Training data generated directly as CUDA tensors.
# NOTE: these four tensors are overwritten a few lines below by copies of the
# NumPy arrays, so the GPU run trains on the same data as the other two.
Xtrain_gpu = torch.linspace(0.,20.0,n).view((n,1)).type(dtype) - 10
Ttrain_gpu = 0.2 + 0.05 * (Xtrain_gpu+10) + 0.4 * torch.sin(Xtrain_gpu+10) + 0.2 * torch.randn((n,1)).type(dtype)

# Make some testing data
Xtest_gpu = Xtrain_gpu + 0.1 * torch.randn((n,1)).type(dtype)
Ttest_gpu = 0.2 + 0.05 * (Xtest_gpu+10) + 0.4 * torch.sin(Xtest_gpu+10) + 0.2 * torch.randn((n,1)).type(dtype)
# Replace with the NumPy-generated data, converted to the CUDA tensor type.
Xtrain_gpu = torch.from_numpy(Xtrain).type(dtype)
Ttrain_gpu = torch.from_numpy(Ttrain).type(dtype)

Xtest_gpu = torch.from_numpy(Xtest).type(dtype)
Ttest_gpu = torch.from_numpy(Ttest).type(dtype)
Xtrain_gpu.type()
'torch.cuda.FloatTensor'
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with hand-written backprop on torch tensors.

    PyTorch counterpart of the NumPy implementation.  The network is
    ``Y = tanh(X @ V[1:] + V[0]) @ W[1:] + W[0]`` (row 0 of each weight matrix
    holds the biases) and every repetition is one full-batch gradient step.
    All tensors created here use the tensor type reported by ``Xtrain.type()``,
    so CUDA inputs keep the whole loop on the GPU.

    Args:
        Xtrain: 2-D tensor of training inputs, one sample per row.
        Ttrain: 2-D tensor of training targets, one sample per row.
        Xtest: 2-D tensor of test inputs (used only for the error trace/plots).
        Ttest: 2-D tensor of test targets.
        nHiddens: number of tanh hidden units.
        rhoh: hidden-layer learning rate; divided by nSamples*nOutputs before use.
        rhoo: output-layer learning rate; divided by nSamples*nOutputs before use.
        nReps: number of full-batch update steps.
        graphics: when true, redraw three diagnostic plots every 1000 steps and
            on the last step.  The plots use Xtrain/Xtest directly as the x axis,
            so they are only meaningful for single-column inputs.

    Returns:
        ``(errorTrace, seconds)`` where ``errorTrace`` is an ``(nReps, 2)`` tensor
        of the same tensor type as ``Xtrain`` (column 0: training RMSE from the
        forward pass before that step's update; column 1: test RMSE after the
        update) and ``seconds`` is the wall-clock time of initialization plus
        the loop, including any plotting.
    """

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too

    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]
    nOutputs = Ttrain.shape[1]

    # Scale the learning rates by the number of error terms summed per step.
    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1).  Row 0 is the bias row,
    # hence the "1 +"; the first dimension of V follows the input width.
    V = 0.1*2*(torch.rand(1+nInputs,nHiddens)-0.5).type(dtype)
    W = 0.1*2*(torch.rand(1+nHiddens,nOutputs)-0.5).type(dtype)

    # collect training and testing errors for plotting
    errorTrace = torch.zeros((nReps,2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data.
        Z = torch.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps.
        # vDelta must be computed before W changes; (1 - Z**2) is tanh'.
        vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
        V[1:, :] += rh * Xtrain.t() @ vDelta
        # Bias row only: the slice must be 0:1.  A slice of 0: would broadcast
        # the bias gradient onto every row of the weight matrix.
        V[0:1, :] += rh * torch.sum(vDelta, 0, keepdim=True)

        W[1:, :] += ro * Z.t() @ error
        W[0:1, :] += ro * torch.sum(error, 0, keepdim=True)

        # error traces for plotting (torch_rms returns a 1x1 tensor)
        errorTrace[reps,0:1] = torch_rms(error)
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  # forward pass in one line
        errorTrace[reps,1:2] = torch_rms(Ytest - Ttest)

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
            # .cpu() is a no-op for tensors already on the CPU, so one code
            # path serves both CPU and CUDA data.
            plt.clf()
            plt.subplot(3,1,1)
            plt.plot(errorTrace[:reps+1, :].cpu().numpy())
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')

            plt.subplot(3,1,2)
            plt.plot(Xtrain.cpu().numpy(), Ttrain.cpu().numpy(), 'o-',
                     Xtest.cpu().numpy(), Ttest.cpu().numpy(), 'o-',
                     Xtest.cpu().numpy(), Ytest.cpu().numpy(), 'o-')
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3,1,3)
            plt.plot(Xtrain.cpu().numpy(), Z.cpu().numpy())
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            # Replace the previous frame in the notebook output area.
            ipd.clear_output(wait=True)
            ipd.display(fig)

    # CUDA kernels are launched asynchronously; wait for the queued work to
    # finish so the elapsed time covers the computation, not just the launches.
    if Xtrain.is_cuda:
        torch.cuda.synchronize()
    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

All three, with bigger nets

# Benchmark: time the NumPy, torch-CPU and torch-GPU trainers on the same data
# for every combination of step count and hidden-layer width.
# Each row appended to `results` is [nH, nReps, seconds_numpy, seconds_torch, seconds_gpu].
results = []
for nReps in [1000, 5000, 100000]:
    for nH in [10, 100, 1000, 10000]:
        
        # The output-layer rate (rhoo) is set far below the hidden-layer rate here.
        errors_numpy, seconds_numpy = nn_numpy(Xtrain, Ttrain, Xtest, Ttest,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        # nn_torch is the same function for CPU and GPU; the device follows the input tensors.
        errors_torch, seconds_torch = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        errors_gpu, seconds_gpu = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu, 
                                           nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        # Report timings plus the final test RMSE (last row, column 1) of each run.
        print('nHidden {:5d} nReps {:7d}, numpy {:8.2f}, torch {:8.2f}, gpu {:8.2f}  errors {:.2f} {:.2f} {:.2f}'.
              format(nH, nReps, seconds_numpy, seconds_torch, seconds_gpu, 
                     errors_numpy[-1,1], errors_torch[-1,1], errors_gpu[-1,1]))
        results.append([nH, nReps, seconds_numpy, seconds_torch, seconds_gpu])
nHidden    10 nReps    1000, numpy     0.10, torch     0.45, gpu     1.07  errors 0.40 0.44 0.44
nHidden   100 nReps    1000, numpy     0.14, torch     0.57, gpu     1.07  errors 0.36 0.36 0.36
nHidden  1000 nReps    1000, numpy     1.75, torch     0.97, gpu     1.06  errors 0.32 0.34 0.32
nHidden 10000 nReps    1000, numpy    12.64, torch     3.84, gpu     1.07  errors 1.79 0.40 0.43
nHidden    10 nReps    5000, numpy     0.41, torch     2.37, gpu     5.26  errors 0.36 0.36 0.36
nHidden   100 nReps    5000, numpy     0.69, torch     2.51, gpu     5.24  errors 0.36 0.36 0.36
nHidden  1000 nReps    5000, numpy     9.31, torch     4.58, gpu     5.40  errors 0.30 0.30 0.29
nHidden 10000 nReps    5000, numpy    87.69, torch    17.12, gpu     5.33  errors 0.48 1.00 0.32
nHidden    10 nReps  100000, numpy     7.87, torch    49.92, gpu   107.32  errors 0.31 0.32 0.31
nHidden   100 nReps  100000, numpy    13.81, torch    50.71, gpu   106.52  errors 0.30 0.31 0.30
nHidden  1000 nReps  100000, numpy   198.23, torch    83.84, gpu   105.46  errors 0.28 0.28 0.27
nHidden 10000 nReps  100000, numpy  1512.25, torch   419.72, gpu   107.23  errors 0.68 1.18 0.97
# Plot training time against hidden-layer width for the three implementations.
# Columns of `results`: [nH, nReps, seconds_numpy, seconds_torch, seconds_gpu].
results = np.array(results)

legends = []
# Hidden-unit counts for the x axis, taken from the first four rows
# (the inner benchmark loop produces four widths per nReps value).
nH = results[:4, 0:1]

# Disabled: curves for the two shorter runs (nReps 1000 and 5000).
if False:
    rows = results[:,1] == 1000
    plt.semilogx(nH,results[rows,2:])
    legends = ['nReps 1000 ' + s for s in ['np', 'torch', 'gpu']]

    rows = results[:,1] == 5000
    plt.semilogx(nH,results[rows, 2:])
    legends += ['nReps 5000 ' + s for s in ['np', 'torch', 'gpu']]

# Only the longest run (nReps 100000) is plotted.
rows = results[:,1] == 100000

# Columns 2: are the three timing columns, drawn as one line each.
plt.semilogx(nH,results[rows, 2:], 'o-')
legends += ['nReps 100000 ' + s for s in ['np', 'torch', 'gpu']]
plt.ylabel('Seconds')
plt.xlabel('Number of Hidden Units')
plt.legend(legends);

結果は、numpyが速度アップしている一方で、pytorch-cpuとpytorch-gpuは大幅に速度ダウンしていた。予想外の結果にかなりの衝撃を受けている。cuda10との相性が悪いのかどうかは分からないが、pytorchがcuda10にまだ最適化されていないのが原因と思われる。また暫くしたら試してみようと思う。

参考サイト: Simple Neural Network