NumPy, PyTorch(CPU), PyTorch(GPU)の速度比較

今回は、このサイトのPyTorch tutorialをやる。このチュートリアルは、NumPy, PyTorch(CPU), PyTorch(GPU)の速度比較をしているのが興味深い。numpyからpytorchへ、PyTorch(CPU)からPyTorch(GPU)へのコードの書き換えの良い手本にもなるので効果的と言える。

スポンサーリンク

Simple Neural Network

単純な2層ニューラルネットワークは2通りに実装される。一つはnumpy実装で、小回帰を使って例証し、もう一つは、この実装をpytorchに変換してから同じデータで例証する。最後に、numpy, pytorch(CPU), pytorch(GPU)間の速度比較をする。

import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd  # for display and clear_output
import time  # for sleep and time()

%matplotlib inline
plt.rcParams.update({'font.size': 18, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# Make some training data
n = 20
Xtrain = np.linspace(0., 20.0, n).reshape((n,1)) - 10
Ttrain = 0.2 + 0.05 * (Xtrain+10) + 0.4 * np.sin(Xtrain+10) + 0.2 * np.random.normal(size=(n,1))

# Make some testing data
Xtest = Xtrain + 0.1*np.random.normal(size=(n,1))
Ttest = 0.2 + 0.05 * (Xtest+10) + 0.4 * np.sin(Xtest+10) + 0.2 * np.random.normal(size=(n,1))

Xtrain = Xtrain.astype(np.float32)
Ttrain = Ttrain.astype(np.float32)
Xtest = Xtest.astype(np.float32)
Ttest = Ttest.astype(np.float32)
plt.rcParams['figure.figsize'] = 14, 7
plt.rcParams["font.size"] = "17"
plt.plot(Xtrain, Ttrain, label='Training Data')
plt.plot(Xtest, Ttest, label='Testing Data')
plt.legend();

Numpy実装

def nn_numpy(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):

    nSamples = Xtrain.shape[0]
    nOutputs = Ttrain.shape[1]

    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)

    startTime = time.time()
    
    # Initialize weights to uniformly distributed values between small normally-distributed between -0.1 and 0.1
    V = 0.1*2*(np.random.uniform(size=(1+1,nHiddens))-0.5).astype(Xtrain.dtype)
    W = 0.1*2*(np.random.uniform(size=(1+nHiddens,nOutputs))-0.5).astype(Xtrain.dtype)

    # collect training and testing errors for plotting
    errorTrace = np.zeros((nReps,2))

    if graphics:
        fig = plt.figure(figsize=(18,18))
        
    for reps in range(nReps):

        # Forward pass on training data
        Z = np.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps
        vDelta = ( error @ W[1:,:].T) * (1-Z**2)
        V[1:, :] += rh * Xtrain.T @ vDelta
        V[0, :] += rh * np.sum(vDelta, 0)
    
        W[1:, :] += ro * Z.T @ error
        W[0, :] += ro * np.sum(error, 0)

        # error traces for plotting
        errorTrace[reps,0] = np.sqrt(np.mean((error**2)))
        Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        errorTrace[reps,1] = np.sqrt(np.mean((Ytest-Ttest)**2))

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
                   
            plt.clf()
            plt.subplot(3,1,1)
            plt.plot(errorTrace[:reps+1,:])
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')
        
            plt.subplot(3,1,2)
            plt.plot(Xtrain, Ttrain, 'o-',
                     Xtest, Ttest, 'o-', 
                     Xtest, Ytest, 'o-')
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')
        
            plt.subplot(3,1,3)
            plt.plot(Xtrain, Z)
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');
        
            ipd.clear_output(wait=True)
            ipd.display(fig)
            
    endTime = time.time()
    
    if graphics:
        ipd.clear_output(wait=True)
        
    return errorTrace, endTime - startTime
errorTrace, _ = nn_numpy(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.5, rhoo=0.1, nReps=50000, graphics=True)

今度はグラフィック無しで実行時間を測る。

_, seconds = nn_numpy(Xtrain, Ttrain, Xtest, Ttest, nHiddens=1000, rhoh=0.01, rhoo=0.001, nReps=500000, graphics=False)
seconds
952.2789857387543

Pytorch

次に、pytorch版をJustin Johnsonのこのリポの例に則してやる。

import torch
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
dtype
torch.FloatTensor
Xtrain_torch = torch.from_numpy(Xtrain)
Xtrain_torch.type()
'torch.FloatTensor'
# Make some training data
n = 20

# Xtrain = np.linspace(0.,20.0,n).reshape((n,1)) - 10
# Ttrain = 0.2 + 0.05 * (Xtrain+10) + 0.4 * np.sin(Xtrain+10) + 0.2 * np.random.normal(size=(n,1))
Xtrain_torch = torch.linspace(0.,20.0,n).view((n,1)).type(dtype) - 10
Ttrain_torch = 0.2 + 0.05 * (Xtrain_torch+10) + 0.4 * torch.sin(Xtrain_torch+10) + 0.2 * torch.randn((n,1)).type(dtype)

# Make some testing data
# Xtest = Xtrain + 0.1*np.random.normal(size=(n,1))
# Ttest = 0.2 + 0.05 * (Xtest+10) + 0.4 * np.sin(Xtest+10) + 0.2 * np.random.normal(size=(n,1))
Xtest_torch = Xtrain_torch + 0.1 * torch.randn((n,1)).type(dtype)
Ttest_torch = 0.2 + 0.05 * (Xtest_torch+10) + 0.4 * torch.sin(Xtest_torch+10) + 0.2 * torch.randn((n,1)).type(dtype)

同じランダムに生成されたデータを使う。

Xtrain_torch = torch.from_numpy(Xtrain)
Ttrain_torch = torch.from_numpy(Ttrain)

Xtest_torch = torch.from_numpy(Xtest)
Ttest_torch = torch.from_numpy(Ttest)

いったんpytorchがスカラー型をインクルードすればこれは必要ないかもしれない。

def torch_rms(error):
    m = torch.mean(error**2, 0, keepdim=True)
    mall = torch.mean(m, 1, keepdim=True)
    return torch.sqrt(mall)
dt = torch.FloatTensor
a = torch.ones(5,5).type(dt)
b = a+1.2
print(a)
print(b)
torch_rms(a-b)
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000]])
tensor([[1.2000]])
dt = torch.cuda.FloatTensor
a = torch.ones(5,5).type(dt)
b = a+1.2
print(a)
print(b)
torch_rms(a-b)
tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]], device='cuda:0')
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
        [2.2000, 2.2000, 2.2000, 2.2000, 2.2000]], device='cuda:0')
tensor([[1.2000]], device='cuda:0')
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too
    
    nSamples = Xtrain.shape[0]
    nOutputs = Ttrain.shape[1]
    # nSamples = Xtrain.size(0)
    # nOutputs = Ttrain.size(1)

    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)
    
    startTime = time.time()
    
    # Initialize weights to uniformly distributed values between small normally-distributed between -0.1 and 0.1
    # V = 0.1*2*(np.random.uniform(size=(1+1,nHiddens))-0.5)
    # W = 0.1*2*(np.random.uniform(size=(1+nHiddens,nOutputs))-0.5)
    V = 0.1*2*(torch.rand(1+1,nHiddens)-0.5).type(dtype)
    W = 0.1*2*(torch.rand(1+nHiddens,nOutputs)-0.5).type(dtype)

    # collect training and testing errors for plotting
    # errorTrace = np.zeros((nReps,2))
    errorTrace = torch.zeros((nReps,2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data.  No change going from numpy to pytorch!
        Z = np.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps. Only change is in transpose and sum
        # vDelta = ( error @ W[1:,:].T) * (1-Z**2)
        # V[1:, :] += rh * Xtrain.T @ vDelta
        # V[0, :] += rh * np.sum(vDelta, 0)
        vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
        V[1:, :] += rh * Xtrain.t() @ vDelta
        V[0, :] += rh * torch.sum(vDelta, 0, keepdim=True)  # to prevent conversion to scalar. Necessary?
        
        # W[1:, :] += ro * Z.T @ error
        # W[0, :] += ro * np.sum(error, 0)False,
        W[1:, :] += ro * Z.t() @ error
        W[0, :] += ro * torch.sum(error, 0, keepdim=True)

        # error traces for plotting
        # errorTrace[reps,0] = np.sqrt(np.mean((error**2)))
        # Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        # errorTrace[reps,1] = np.sqrt(np.mean((Ytest-Ttest)**2))
        errorTrace[reps,0:1] = torch_rms(error)
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        errorTrace[reps,1:2] = torch_rms(Ytest - Ttest)

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
            plt.clf()
            plt.subplot(3,1,1)
            # plt.plot(errorTrace[:reps+1,:])
            if errorTrace.is_cuda:
                plt.plot(errorTrace[:reps+1, :].cpu().numpy())
            else:
                plt.plot(errorTrace[:reps+1, :].numpy())
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')
        
            plt.subplot(3,1,2)
            # plt.plot(Xtrain, Ttrain, 'o-', Xtest, Ttest, 'o-', Xtest, Ytest, 'o-')
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Ttrain.cpu().numpy(),'o-', 
                         Xtest.cpu().numpy(), Ttest.cpu().numpy(), 'o-', 
                         Xtest.cpu().numpy(), Ytest.cpu().numpy(), 'o-')
            else:
                plt.plot(Xtrain.numpy(), Ttrain.numpy(),'o-', 
                         Xtest.numpy(), Ttest.numpy(), 'o-', 
                         Xtest.numpy(), Ytest.numpy(), 'o-') 
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')
        
            plt.subplot(3,1,3)
            # plt.plot(Xtrain, Z)
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Z.cpu().numpy())
            else:
                plt.plot(Xtrain.numpy(), Z.numpy())
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');
        
            ipd.clear_output(wait=True)
            ipd.display(fig)

    endTime = time.time()
    
    if graphics:
        ipd.clear_output(wait=True)
        
    return errorTrace, endTime - startTime
errorTrace, _ = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch, 
                         nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=True)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-15-0f7e5d50646e> in <module>()
      1 errorTrace, _ = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch, 
----> 2                          nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=True)

<ipython-input-14-41bdb63b2125> in nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens, rhoh, rhoo, nReps, graphics)
     41         vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
     42         V[1:, :] += rh * Xtrain.t() @ vDelta
---> 43         V[0, :] += rh * torch.sum(vDelta, 0, keepdim=True)  # to prevent conversion to scalar. Necessary?
     44 
     45         # W[1:, :] += ro * Z.T @ error

RuntimeError: output with shape [10] doesn't match the broadcast shape [1, 10]
<Figure size 1296x1296 with 0 Axes>
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too
    
    nSamples = Xtrain.shape[0]
    nOutputs = Ttrain.shape[1]
    # nSamples = Xtrain.size(0)
    # nOutputs = Ttrain.size(1)

    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)
    
    startTime = time.time()
    
    # Initialize weights to uniformly distributed values between small normally-distributed between -0.1 and 0.1
    # V = 0.1*2*(np.random.uniform(size=(1+1,nHiddens))-0.5)
    # W = 0.1*2*(np.random.uniform(size=(1+nHiddens,nOutputs))-0.5)
    V = 0.1*2*(torch.rand(1+1,nHiddens)-0.5).type(dtype)
    W = 0.1*2*(torch.rand(1+nHiddens,nOutputs)-0.5).type(dtype)

    # collect training and testing errors for plotting
    # errorTrace = np.zeros((nReps,2))
    errorTrace = torch.zeros((nReps,2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data.  No change going from numpy to pytorch!
        Z = np.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps. Only change is in transpose and sum
        # vDelta = ( error @ W[1:,:].T) * (1-Z**2)
        # V[1:, :] += rh * Xtrain.T @ vDelta
        # V[0, :] += rh * np.sum(vDelta, 0)
        vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
        V[1:, :] += rh * Xtrain.t() @ vDelta
        V[0:, :] += rh * torch.sum(vDelta, 0, keepdim=True)  # to prevent conversion to scalar. Necessary?
        
        # W[1:, :] += ro * Z.T @ error
        # W[0, :] += ro * np.sum(error, 0)False,
        W[1:, :] += ro * Z.t() @ error
        W[0:, :] += ro * torch.sum(error, 0, keepdim=True)

        # error traces for plotting
        # errorTrace[reps,0] = np.sqrt(np.mean((error**2)))
        # Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        # errorTrace[reps,1] = np.sqrt(np.mean((Ytest-Ttest)**2))
        errorTrace[reps,0:1] = torch_rms(error)
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        errorTrace[reps,1:2] = torch_rms(Ytest - Ttest)

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
            plt.clf()
            plt.subplot(3,1,1)
            # plt.plot(errorTrace[:reps+1,:])
            if errorTrace.is_cuda:
                plt.plot(errorTrace[:reps+1, :].cpu().numpy())
            else:
                plt.plot(errorTrace[:reps+1, :].numpy())
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')
        
            plt.subplot(3,1,2)
            # plt.plot(Xtrain, Ttrain, 'o-', Xtest, Ttest, 'o-', Xtest, Ytest, 'o-')
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Ttrain.cpu().numpy(),'o-', 
                         Xtest.cpu().numpy(), Ttest.cpu().numpy(), 'o-', 
                         Xtest.cpu().numpy(), Ytest.cpu().numpy(), 'o-')
            else:
                plt.plot(Xtrain.numpy(), Ttrain.numpy(),'o-', 
                         Xtest.numpy(), Ttest.numpy(), 'o-', 
                         Xtest.numpy(), Ytest.numpy(), 'o-') 
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')
        
            plt.subplot(3,1,3)
            # plt.plot(Xtrain, Z)
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Z.cpu().numpy())
            else:
                plt.plot(Xtrain.numpy(), Z.numpy())
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');
        
            ipd.clear_output(wait=True)
            ipd.display(fig)

    endTime = time.time()
    
    if graphics:
        ipd.clear_output(wait=True)
        
    return errorTrace, endTime - startTime
errorTrace, _ = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch, 
                         nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=True)
_, seconds = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch,
                      nHiddens=10, rhoh=0.5, rhoo=0.1, nReps=50000, graphics=False)
seconds
21.48136854171753

Pytorch on GPU

torch.cuda.is_available()
True
dtype = torch.cuda.FloatTensor
dtype
torch.cuda.FloatTensor
dtype = torch.cuda.FloatTensor
n = 20

Xtrain_gpu = torch.linspace(0.,20.0,n).view((n,1)).type(dtype) - 10
Ttrain_gpu = 0.2 + 0.05 * (Xtrain_gpu+10) + 0.4 * torch.sin(Xtrain_gpu+10) + 0.2 * torch.randn((n,1)).type(dtype)

# Make some testing data
Xtest_gpu = Xtrain_gpu + 0.1 * torch.randn((n,1)).type(dtype)
Ttest_gpu = 0.2 + 0.05 * (Xtest_gpu+10) + 0.4 * torch.sin(Xtest_gpu+10) + 0.2 * torch.randn((n,1)).type(dtype)
Xtrain_gpu = torch.from_numpy(Xtrain).type(dtype)
Ttrain_gpu = torch.from_numpy(Ttrain).type(dtype)

Xtest_gpu = torch.from_numpy(Xtest).type(dtype)
Ttest_gpu = torch.from_numpy(Ttest).type(dtype)
Xtrain_gpu.type()
'torch.cuda.FloatTensor'
errorTrace, _ = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu,
                         nHiddens=10, rhoh=0.5, rhoo=0.1, nReps=50000, graphics=True)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-24-62023d513a8e> in <module>()
      1 errorTrace, _ = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu,
----> 2                          nHiddens=10, rhoh=0.5, rhoo=0.1, nReps=50000, graphics=True)

<ipython-input-23-719c08ecd04d> in nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens, rhoh, rhoo, nReps, graphics)
     29 
     30         # Forward pass on training data.  No change going from numpy to pytorch!
---> 31         Z = np.tanh(Xtrain @ V[1:,:] + V[0,:])
     32         Y = Z @ W[1:,:] + W[0,:]
     33 

~/.pyenv/versions/py365/lib/python3.6/site-packages/torch/tensor.py in __array__(self, dtype)
    410     def __array__(self, dtype=None):
    411         if dtype is None:
--> 412             return self.numpy()
    413         else:
    414             return self.numpy().astype(dtype, copy=False)

TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
<Figure size 1296x1296 with 0 Axes>
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too
    
    nSamples = Xtrain.shape[0]
    nOutputs = Ttrain.shape[1]
    # nSamples = Xtrain.size(0)
    # nOutputs = Ttrain.size(1)

    rh = rhoh / (nSamples*nOutputs)
    ro = rhoo / (nSamples*nOutputs)
    
    startTime = time.time()
    
    # Initialize weights to uniformly distributed values between small normally-distributed between -0.1 and 0.1
    # V = 0.1*2*(np.random.uniform(size=(1+1,nHiddens))-0.5)
    # W = 0.1*2*(np.random.uniform(size=(1+nHiddens,nOutputs))-0.5)
    V = 0.1*2*(torch.rand(1+1,nHiddens)-0.5).type(dtype)
    W = 0.1*2*(torch.rand(1+nHiddens,nOutputs)-0.5).type(dtype)

    # collect training and testing errors for plotting
    # errorTrace = np.zeros((nReps,2))
    errorTrace = torch.zeros((nReps,2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18,18))

    for reps in range(nReps):

        # Forward pass on training data.  No change going from numpy to pytorch!
        Z = torch.tanh(Xtrain @ V[1:,:] + V[0,:])
        Y = Z @ W[1:,:] + W[0,:]

        # Error in output
        error = Ttrain - Y

        # Backward pass - the backpropagation and weight update steps. Only change is in transpose and sum
        # vDelta = ( error @ W[1:,:].T) * (1-Z**2)
        # V[1:, :] += rh * Xtrain.T @ vDelta
        # V[0, :] += rh * np.sum(vDelta, 0)
        vDelta = ( error @ W[1:,:].t() * (1-Z**2) )
        V[1:, :] += rh * Xtrain.t() @ vDelta
        V[0:, :] += rh * torch.sum(vDelta, 0, keepdim=True)  # to prevent conversion to scalar. Necessary?
        
        # W[1:, :] += ro * Z.T @ error
        # W[0, :] += ro * np.sum(error, 0)False,
        W[1:, :] += ro * Z.t() @ error
        W[0:, :] += ro * torch.sum(error, 0, keepdim=True)

        # error traces for plotting
        # errorTrace[reps,0] = np.sqrt(np.mean((error**2)))
        # Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        # errorTrace[reps,1] = np.sqrt(np.mean((Ytest-Ttest)**2))
        errorTrace[reps,0:1] = torch_rms(error)
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]  #!! Forward pass in one line
        errorTrace[reps,1:2] = torch_rms(Ytest - Ttest)

        if graphics and (reps % 1000 == 0 or reps == nReps-1):
            plt.clf()
            plt.subplot(3,1,1)
            # plt.plot(errorTrace[:reps+1,:])
            if errorTrace.is_cuda:
                plt.plot(errorTrace[:reps+1, :].cpu().numpy())
            else:
                plt.plot(errorTrace[:reps+1, :].numpy())
            plt.ylim(0,0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train','Test'),loc='best')
        
            plt.subplot(3,1,2)
            # plt.plot(Xtrain, Ttrain, 'o-', Xtest, Ttest, 'o-', Xtest, Ytest, 'o-')
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Ttrain.cpu().numpy(),'o-', 
                         Xtest.cpu().numpy(), Ttest.cpu().numpy(), 'o-', 
                         Xtest.cpu().numpy(), Ytest.cpu().numpy(), 'o-')
            else:
                plt.plot(Xtrain.numpy(), Ttrain.numpy(),'o-', 
                         Xtest.numpy(), Ttest.numpy(), 'o-', 
                         Xtest.numpy(), Ytest.numpy(), 'o-') 
            plt.xlim(-10,10)
            plt.legend(('Training','Testing','Model'),loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')
        
            plt.subplot(3,1,3)
            # plt.plot(Xtrain, Z)
            if Xtrain.is_cuda:    
                plt.plot(Xtrain.cpu().numpy(), Z.cpu().numpy())
            else:
                plt.plot(Xtrain.numpy(), Z.numpy())
            plt.ylim(-1.1,1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');
        
            ipd.clear_output(wait=True)
            ipd.display(fig)

    endTime = time.time()
    
    if graphics:
        ipd.clear_output(wait=True)
        
    return errorTrace, endTime - startTime
errorTrace, _ = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu,
                         nHiddens=10, rhoh=0.5, rhoo=0.1, nReps=50000, graphics=True)
_, seconds = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu,
                      nHiddens=5000, rhoh=0.001, rhoo=0.001, nReps=50000, graphics=False)
seconds
32.83034038543701

All three, with bigger nets

元コードは時間がかかり過ぎるので、nRepsを減らした。

results = []
for nReps in [1000, 5000, 20000]:
    for nH in [10, 100, 1000, 10000]:
        
        errors_numpy, seconds_numpy = nn_numpy(Xtrain, Ttrain, Xtest, Ttest,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        errors_torch, seconds_torch = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        errors_gpu, seconds_gpu = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu, 
                                           nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        print('nHidden {:5d} nReps {:7d}, numpy {:8.2f}, torch {:8.2f}, gpu {:8.2f}  errors {:.2f} {:.2f} {:.2f}'.
              format(nH, nReps, seconds_numpy, seconds_torch, seconds_gpu, 
                     errors_numpy[-1,1], errors_torch[-1,1], errors_gpu[-1,1]))
        results.append([nH, nReps, seconds_numpy, seconds_torch, seconds_gpu])
nHidden    10 nReps    1000, numpy     0.10, torch     0.37, gpu     0.64  errors 0.44 0.39 0.46
nHidden   100 nReps    1000, numpy     0.14, torch     0.44, gpu     0.65  errors 0.34 0.34 0.34
nHidden  1000 nReps    1000, numpy     1.84, torch     0.78, gpu     0.65  errors 0.35 0.35 0.36
nHidden 10000 nReps    1000, numpy    13.09, torch     2.38, gpu     0.66  errors 0.94 0.90 0.68
nHidden    10 nReps    5000, numpy     0.38, torch     2.67, gpu     3.85  errors 0.35 0.35 0.35
nHidden   100 nReps    5000, numpy     0.70, torch     2.93, gpu     3.29  errors 0.34 0.34 0.34
nHidden  1000 nReps    5000, numpy     9.83, torch     3.41, gpu     3.28  errors 0.35 0.35 0.35
nHidden 10000 nReps    5000, numpy    71.22, torch    11.12, gpu     3.33  errors 0.49 0.45 0.51
nHidden    10 nReps   20000, numpy     1.56, torch     7.90, gpu    12.72  errors 0.34 0.34 0.34
nHidden   100 nReps   20000, numpy     2.75, torch     8.48, gpu    12.80  errors 0.34 0.34 0.34
nHidden  1000 nReps   20000, numpy    39.31, torch    13.29, gpu    13.13  errors 0.34 0.36 0.34
nHidden 10000 nReps   20000, numpy   315.43, torch    41.79, gpu    13.23  errors 0.42 0.37 0.50
results
[[10, 1000, 0.09626293182373047, 0.3693883419036865, 0.6446230411529541],
 [100, 1000, 0.1426990032196045, 0.4350268840789795, 0.6464996337890625],
 [1000, 1000, 1.844691276550293, 0.7803530693054199, 0.6508212089538574],
 [10000, 1000, 13.086198806762695, 2.3752691745758057, 0.6644260883331299],
 [10, 5000, 0.3769991397857666, 2.6731884479522705, 3.854335069656372],
 [100, 5000, 0.6954240798950195, 2.9310402870178223, 3.288994073867798],
 [1000, 5000, 9.830886840820312, 3.406135082244873, 3.282198190689087],
 [10000, 5000, 71.21766066551208, 11.124212741851807, 3.3327584266662598],
 [10, 20000, 1.5561366081237793, 7.895970106124878, 12.721657514572144],
 [100, 20000, 2.747685670852661, 8.4824800491333, 12.801572799682617],
 [1000, 20000, 39.311296701431274, 13.29249095916748, 13.128490209579468],
 [10000, 20000, 315.4342806339264, 41.791707277297974, 13.232482671737671]]
results = np.array(results)

legends = []
nH = results[:4, 0:1]

if False:
    rows = results[:,1] == 1000
    plt.semilogx(nH,results[rows,2:])
    legends = ['nReps 1000 ' + s for s in ['np', 'torch', 'gpu']]

    rows = results[:,1] == 5000
    plt.semilogx(nH,results[rows, 2:])
    legends += ['nReps 5000 ' + s for s in ['np', 'torch', 'gpu']]

rows = results[:,1] == 20000
plt.semilogx(nH,results[rows, 2:], 'o-')
legends += ['nReps 20000 ' + s for s in ['np', 'torch', 'gpu']]

plt.ylabel('Seconds')
plt.xlabel('Number of Hidden Units')
plt.legend(legends);

pytorch(CPU)がGPUと大差があまりないことに驚かされたので、nRepsを100000にして再度コードを走らせてみた。

results = []
for nReps in [1000, 5000, 100000]:
    for nH in [10, 100, 1000, 10000]:
        
        errors_numpy, seconds_numpy = nn_numpy(Xtrain, Ttrain, Xtest, Ttest,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        errors_torch, seconds_torch = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch,
                                               nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        errors_gpu, seconds_gpu = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu, 
                                           nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)
        
        print('nHidden {:5d} nReps {:7d}, numpy {:8.2f}, torch {:8.2f}, gpu {:8.2f}  errors {:.2f} {:.2f} {:.2f}'.
              format(nH, nReps, seconds_numpy, seconds_torch, seconds_gpu, 
                     errors_numpy[-1,1], errors_torch[-1,1], errors_gpu[-1,1]))
        results.append([nH, nReps, seconds_numpy, seconds_torch, seconds_gpu])
nHidden    10 nReps    1000, numpy     0.08, torch     0.37, gpu     0.66  errors 0.43 0.44 0.39
nHidden   100 nReps    1000, numpy     0.14, torch     0.51, gpu     0.68  errors 0.34 0.34 0.34
nHidden  1000 nReps    1000, numpy     2.04, torch     0.73, gpu     0.64  errors 0.39 0.36 0.38
nHidden 10000 nReps    1000, numpy    12.30, torch     2.02, gpu     0.65  errors 0.80 0.53 0.56
nHidden    10 nReps    5000, numpy     0.40, torch     1.81, gpu     3.18  errors 0.35 0.35 0.35
nHidden   100 nReps    5000, numpy     0.69, torch     1.94, gpu     3.24  errors 0.34 0.34 0.34
nHidden  1000 nReps    5000, numpy     8.63, torch     3.21, gpu     3.29  errors 0.35 0.34 0.36
nHidden 10000 nReps    5000, numpy    82.07, torch    10.59, gpu     3.26  errors 0.70 0.40 0.44
nHidden    10 nReps  100000, numpy     7.50, torch    39.12, gpu    63.81  errors 0.33 0.34 0.33
nHidden   100 nReps  100000, numpy    13.75, torch    42.42, gpu    65.74  errors 0.34 0.34 0.34
nHidden  1000 nReps  100000, numpy   205.15, torch    66.14, gpu    64.05  errors 0.35 0.34 0.34
nHidden 10000 nReps  100000, numpy  1515.25, torch   203.42, gpu    65.33  errors 0.41 0.54 0.52
results = np.array(results)

legends = []
nH = results[:4, 0:1]

if False:
    rows = results[:,1] == 1000
    plt.semilogx(nH,results[rows,2:])
    legends = ['nReps 1000 ' + s for s in ['np', 'torch', 'gpu']]

    rows = results[:,1] == 5000
    plt.semilogx(nH,results[rows, 2:])
    legends += ['nReps 5000 ' + s for s in ['np', 'torch', 'gpu']]

rows = results[:,1] == 100000
plt.semilogx(nH,results[rows, 2:], 'o-')
legends += ['nReps 100000 ' + s for s in ['np', 'torch', 'gpu']]

plt.ylabel('Seconds')
plt.xlabel('Number of Hidden Units')
plt.legend(legends);

pytorch(cpu)はバージョンアップでかなり高速化したように見受けられる。