# cuda10+cudnn7.3+pytorch1.0の実力を試してみる

スポンサーリンク

## Simple Neural Network

import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd  # for display and clear_output
import time  # for sleep and time()

# IPython/Jupyter magic (not plain Python): selects the inline matplotlib backend.
%matplotlib inline
# Set the default font size to 18 and switch regular text and math text to the STIX fonts.
plt.rcParams.update({'font.size': 18, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})

# Synthetic 1-D regression problem: a linear trend plus a sinusoid, observed with Gaussian noise.
n = 20


def _noisy_target(x):
    """Evaluate the underlying curve at x and add Gaussian noise scaled by 0.2."""
    return 0.2 + 0.05 * (x + 10) + 0.4 * np.sin(x + 10) + 0.2 * np.random.normal(size=x.shape)


# Training inputs: n evenly spaced points covering [-10, 10], stored as a column vector.
Xtrain = np.linspace(0.0, 20.0, num=n)[:, np.newaxis] - 10
Ttrain = _noisy_target(Xtrain)

# Testing inputs: the training inputs jittered by Gaussian noise scaled by 0.1.
input_jitter = np.random.normal(size=Xtrain.shape)
Xtest = Xtrain + 0.1 * input_jitter
Ttest = _noisy_target(Xtest)

# Store everything as float32.
Xtrain, Ttrain, Xtest, Ttest = (
    arr.astype(np.float32) for arr in (Xtrain, Ttrain, Xtest, Ttest)
)

# Figure size and font size for the data overview plot.
plt.rcParams.update({'figure.figsize': (14, 7), 'font.size': "17"})

# Draw the training and testing curves on the same axes.
for xs, ts, series_label in ((Xtrain, Ttrain, 'Training Data'), (Xtest, Ttest, 'Testing Data')):
    plt.plot(xs, ts, label=series_label)
plt.legend();

スポンサーリンク

## Numpy実装

def nn_numpy(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with full-batch gradient steps in NumPy.

    Parameters
    ----------
    Xtrain, Xtest : 2-D arrays, one sample per row, one input feature per column.
    Ttrain, Ttest : 2-D arrays of targets, one sample per row.
    nHiddens : number of tanh hidden units.
    rhoh, rhoo : step sizes for the hidden and output layers; each is divided by
        (number of training samples * number of outputs) before use.
    nReps : number of full-batch update steps.
    graphics : when True, redraw a three-panel progress figure every 1000 steps
        and on the last step (the middle and bottom panels plot against the
        columns of Xtrain/Xtest directly).

    Returns
    -------
    (errorTrace, seconds) where errorTrace is an (nReps, 2) array holding, per
    step, the training RMSE measured before that step's update (column 0) and
    the testing RMSE measured after it (column 1), and seconds is the elapsed
    wall-clock time of the whole run.
    """
    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]  # row 0 of V is the bias, so V needs 1 + nInputs rows
    nOutputs = Ttrain.shape[1]

    rh = rhoh / (nSamples * nOutputs)
    ro = rhoo / (nSamples * nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1); row 0 of each matrix is the bias.
    V = 0.1 * 2 * (np.random.uniform(size=(1 + nInputs, nHiddens)) - 0.5).astype(Xtrain.dtype)
    W = 0.1 * 2 * (np.random.uniform(size=(1 + nHiddens, nOutputs)) - 0.5).astype(Xtrain.dtype)

    # Training (column 0) and testing (column 1) RMSE for every step.
    errorTrace = np.zeros((nReps, 2))

    if graphics:
        fig = plt.figure(figsize=(18, 18))

    for reps in range(nReps):

        # Forward pass on training data
        Z = np.tanh(Xtrain @ V[1:, :] + V[0, :])
        Y = Z @ W[1:, :] + W[0, :]

        # Error in output
        error = Ttrain - Y

        # Backward pass: vDelta must be computed from W before W is updated.
        vDelta = (error @ W[1:, :].T) * (1 - Z**2)
        V[1:, :] += rh * Xtrain.T @ vDelta
        V[0, :] += rh * np.sum(vDelta, 0)

        W[1:, :] += ro * Z.T @ error
        W[0, :] += ro * np.sum(error, 0)

        # Error traces for plotting
        errorTrace[reps, 0] = np.sqrt(np.mean(error**2))
        Ytest = np.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]
        errorTrace[reps, 1] = np.sqrt(np.mean((Ytest - Ttest)**2))

        if graphics and (reps % 1000 == 0 or reps == nReps - 1):

            plt.clf()
            plt.subplot(3, 1, 1)
            plt.plot(errorTrace[:reps + 1, :])
            plt.ylim(0, 0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train', 'Test'), loc='best')

            plt.subplot(3, 1, 2)
            plt.plot(Xtrain, Ttrain, 'o-',
                     Xtest, Ttest, 'o-',
                     Xtest, Ytest, 'o-')
            plt.xlim(-10, 10)
            plt.legend(('Training', 'Testing', 'Model'), loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3, 1, 3)
            plt.plot(Xtrain, Z)
            plt.ylim(-1.1, 1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            ipd.clear_output(wait=True)
            ipd.display(fig)

    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

スポンサーリンク

## Pytorch

import torch

# Tensor type used by the CPU cells below.
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
dtype

# NOTE(review): the next line looks like the notebook's echoed value of `dtype`, not a statement — confirm.
torch.FloatTensor
# Wrap the NumPy training inputs as a torch tensor.
Xtrain_torch = torch.from_numpy(Xtrain)

# Show the tensor's type string.
Xtrain_torch.type()

'torch.FloatTensor'
n = 20

# Build the same kind of toy data set directly in torch: inputs evenly spaced
# over [-10, 10], targets following a linear trend plus a sinusoid plus noise.
Xtrain_torch = torch.linspace(0., 20.0, n).view((n, 1)).type(dtype) - 10
train_noise = torch.randn((n, 1)).type(dtype)
Ttrain_torch = 0.2 + 0.05 * (Xtrain_torch + 10) + 0.4 * torch.sin(Xtrain_torch + 10) + 0.2 * train_noise

# Testing inputs are the training inputs with a small random offset.
test_jitter = torch.randn((n, 1)).type(dtype)
Xtest_torch = Xtrain_torch + 0.1 * test_jitter
test_noise = torch.randn((n, 1)).type(dtype)
Ttest_torch = 0.2 + 0.05 * (Xtest_torch + 10) + 0.4 * torch.sin(Xtest_torch + 10) + 0.2 * test_noise

# The tensors built above are replaced here by wrappers around the NumPy arrays.
Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch = (
    torch.from_numpy(arr) for arr in (Xtrain, Ttrain, Xtest, Ttest)
)

def torch_rms(error):
    """Return the root-mean-square of a 2-D tensor as a 1x1 tensor.

    The squared values are averaged over dimension 0 and then over
    dimension 1, both with keepdim=True, so the result has shape (1, 1) and
    lives on the same device as `error`.
    """
    m = torch.mean(error**2, 0, keepdim=True)
    mall = torch.mean(m, 1, keepdim=True)
    # The square root turns the mean squared value into an RMS; without the
    # return statement callers would receive None.
    return torch.sqrt(mall)

# Try torch_rms on CPU tensors: two 5x5 tensors whose entries differ by 1.2.
dt = torch.FloatTensor
a = torch.ones(5, 5).type(dt)
b = a + 1.2
for shown in (a, b):
    print(shown)
torch_rms(a - b)

tensor([[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]])
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000]])

tensor([[1.2000]])
# Same check as on the CPU, with the tensors converted to the CUDA float type.
dt = torch.cuda.FloatTensor
a = torch.ones(5, 5).type(dt)
b = a + 1.2
for shown in (a, b):
    print(shown)
torch_rms(a - b)

tensor([[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1.]], device='cuda:0')
tensor([[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000],
[2.2000, 2.2000, 2.2000, 2.2000, 2.2000]], device='cuda:0')

tensor([[1.2000]], device='cuda:0')
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with full-batch gradient steps in PyTorch.

    Gradients are written out by hand (no autograd). The weights and the error
    trace are converted to Xtrain's tensor type, so passing CUDA tensors runs
    the training on the GPU.

    Parameters
    ----------
    Xtrain, Xtest : 2-D tensors, one sample per row, one input feature per column.
    Ttrain, Ttest : 2-D tensors of targets, one sample per row.
    nHiddens : number of tanh hidden units.
    rhoh, rhoo : step sizes for the hidden and output layers; each is divided by
        (number of training samples * number of outputs) before use.
    nReps : number of full-batch update steps.
    graphics : when True, redraw a three-panel progress figure every 1000 steps
        and on the last step.

    Returns
    -------
    (errorTrace, seconds) where errorTrace is an (nReps, 2) tensor holding, per
    step, the training RMSE measured before that step's update (column 0) and
    the testing RMSE measured after it (column 1), and seconds is the elapsed
    wall-clock time of the whole run.
    """

    def _np(t):
        # .cpu() returns the tensor itself when it is already in CPU memory,
        # so one path serves both CPU and CUDA tensors.
        return t.cpu().numpy()

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too

    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]  # row 0 of V is the bias, so V needs 1 + nInputs rows
    nOutputs = Ttrain.shape[1]

    rh = rhoh / (nSamples * nOutputs)
    ro = rhoo / (nSamples * nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1); row 0 of each matrix is the bias.
    V = 0.1 * 2 * (torch.rand(1 + nInputs, nHiddens) - 0.5).type(dtype)
    W = 0.1 * 2 * (torch.rand(1 + nHiddens, nOutputs) - 0.5).type(dtype)

    # Training (column 0) and testing (column 1) RMSE for every step.
    errorTrace = torch.zeros((nReps, 2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18, 18))

    for reps in range(nReps):

        # Forward pass on training data (torch.tanh: the operands are tensors).
        Z = torch.tanh(Xtrain @ V[1:, :] + V[0, :])
        Y = Z @ W[1:, :] + W[0, :]

        # Error in output
        error = Ttrain - Y

        # Backward pass: vDelta must be computed from W before W is updated.
        vDelta = (error @ W[1:, :].t()) * (1 - Z**2)
        V[1:, :] += rh * Xtrain.t() @ vDelta
        # Bias row only: a `V[0:, :]` target would broadcast this onto every row.
        V[0, :] += rh * torch.sum(vDelta, 0)

        W[1:, :] += ro * Z.t() @ error
        W[0, :] += ro * torch.sum(error, 0)

        # Error traces for plotting
        errorTrace[reps, 0] = torch.sqrt(torch.mean(error**2))
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]
        errorTrace[reps, 1] = torch.sqrt(torch.mean((Ytest - Ttest)**2))

        if graphics and (reps % 1000 == 0 or reps == nReps - 1):
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.plot(_np(errorTrace[:reps + 1, :]))
            plt.ylim(0, 0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train', 'Test'), loc='best')

            plt.subplot(3, 1, 2)
            plt.plot(_np(Xtrain), _np(Ttrain), 'o-',
                     _np(Xtest), _np(Ttest), 'o-',
                     _np(Xtest), _np(Ytest), 'o-')
            plt.xlim(-10, 10)
            plt.legend(('Training', 'Testing', 'Model'), loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3, 1, 3)
            plt.plot(_np(Xtrain), _np(Z))
            plt.ylim(-1.1, 1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            ipd.clear_output(wait=True)
            ipd.display(fig)

    if Xtrain.is_cuda:
        # CUDA kernels are queued asynchronously; wait for them so the elapsed
        # time covers the actual computation.
        torch.cuda.synchronize()
    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

スポンサーリンク

## Pytorch on GPU

# Ask torch whether CUDA can be used.
torch.cuda.is_available()

# NOTE(review): the next line looks like the notebook's echoed result of the call above, not a statement — confirm.
True
# Tensor type used by the GPU cells below.
dtype = torch.cuda.FloatTensor
dtype

torch.cuda.FloatTensor
dtype = torch.cuda.FloatTensor
n = 20

# Build the toy data set with the CUDA float type: inputs evenly spaced over
# [-10, 10], targets following a linear trend plus a sinusoid plus noise.
Xtrain_gpu = torch.linspace(0., 20.0, n).view((n, 1)).type(dtype) - 10
gpu_train_noise = torch.randn((n, 1)).type(dtype)
Ttrain_gpu = 0.2 + 0.05 * (Xtrain_gpu + 10) + 0.4 * torch.sin(Xtrain_gpu + 10) + 0.2 * gpu_train_noise

# Testing inputs are the training inputs with a small random offset.
gpu_test_jitter = torch.randn((n, 1)).type(dtype)
Xtest_gpu = Xtrain_gpu + 0.1 * gpu_test_jitter
gpu_test_noise = torch.randn((n, 1)).type(dtype)
Ttest_gpu = 0.2 + 0.05 * (Xtest_gpu + 10) + 0.4 * torch.sin(Xtest_gpu + 10) + 0.2 * gpu_test_noise

# The tensors built above are replaced here by converted copies of the NumPy arrays.
Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu = (
    torch.from_numpy(arr).type(dtype) for arr in (Xtrain, Ttrain, Xtest, Ttest)
)

Xtrain_gpu.type()

'torch.cuda.FloatTensor'
def nn_torch(Xtrain, Ttrain, Xtest, Ttest, nHiddens=10, rhoh=0.1, rhoo=0.1, nReps=50000, graphics=False):
    """Train a one-hidden-layer tanh network with full-batch gradient steps in PyTorch.

    Gradients are written out by hand (no autograd). The weights and the error
    trace are converted to Xtrain's tensor type, so passing CUDA tensors runs
    the training on the GPU.

    Parameters
    ----------
    Xtrain, Xtest : 2-D tensors, one sample per row, one input feature per column.
    Ttrain, Ttest : 2-D tensors of targets, one sample per row.
    nHiddens : number of tanh hidden units.
    rhoh, rhoo : step sizes for the hidden and output layers; each is divided by
        (number of training samples * number of outputs) before use.
    nReps : number of full-batch update steps.
    graphics : when True, redraw a three-panel progress figure every 1000 steps
        and on the last step.

    Returns
    -------
    (errorTrace, seconds) where errorTrace is an (nReps, 2) tensor holding, per
    step, the training RMSE measured before that step's update (column 0) and
    the testing RMSE measured after it (column 1), and seconds is the elapsed
    wall-clock time of the whole run.
    """

    def _np(t):
        # .cpu() returns the tensor itself when it is already in CPU memory,
        # so one path serves both CPU and CUDA tensors.
        return t.cpu().numpy()

    dtype = Xtrain.type()  # if data is on GPU, allocate network variables on GPU, too

    nSamples = Xtrain.shape[0]
    nInputs = Xtrain.shape[1]  # row 0 of V is the bias, so V needs 1 + nInputs rows
    nOutputs = Ttrain.shape[1]

    rh = rhoh / (nSamples * nOutputs)
    ro = rhoo / (nSamples * nOutputs)

    startTime = time.time()

    # Initialize weights uniformly in [-0.1, 0.1); row 0 of each matrix is the bias.
    V = 0.1 * 2 * (torch.rand(1 + nInputs, nHiddens) - 0.5).type(dtype)
    W = 0.1 * 2 * (torch.rand(1 + nHiddens, nOutputs) - 0.5).type(dtype)

    # Training (column 0) and testing (column 1) RMSE for every step.
    errorTrace = torch.zeros((nReps, 2)).type(dtype)

    if graphics:
        fig = plt.figure(figsize=(18, 18))

    for reps in range(nReps):

        # Forward pass on training data
        Z = torch.tanh(Xtrain @ V[1:, :] + V[0, :])
        Y = Z @ W[1:, :] + W[0, :]

        # Error in output
        error = Ttrain - Y

        # Backward pass: vDelta must be computed from W before W is updated.
        vDelta = (error @ W[1:, :].t()) * (1 - Z**2)
        V[1:, :] += rh * Xtrain.t() @ vDelta
        # Bias row only: a `V[0:, :]` target would broadcast this onto every row.
        V[0, :] += rh * torch.sum(vDelta, 0)

        W[1:, :] += ro * Z.t() @ error
        W[0, :] += ro * torch.sum(error, 0)

        # Error traces for plotting
        errorTrace[reps, 0] = torch.sqrt(torch.mean(error**2))
        Ytest = torch.tanh(Xtest @ V[1:, :] + V[0, :]) @ W[1:, :] + W[0, :]
        errorTrace[reps, 1] = torch.sqrt(torch.mean((Ytest - Ttest)**2))

        if graphics and (reps % 1000 == 0 or reps == nReps - 1):
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.plot(_np(errorTrace[:reps + 1, :]))
            plt.ylim(0, 0.7)
            plt.xlabel('Epochs')
            plt.ylabel('RMSE')
            plt.legend(('Train', 'Test'), loc='best')

            plt.subplot(3, 1, 2)
            plt.plot(_np(Xtrain), _np(Ttrain), 'o-',
                     _np(Xtest), _np(Ttest), 'o-',
                     _np(Xtest), _np(Ytest), 'o-')
            plt.xlim(-10, 10)
            plt.legend(('Training', 'Testing', 'Model'), loc='best')
            plt.xlabel('$x$')
            plt.ylabel('Actual and Predicted $f(x)$')

            plt.subplot(3, 1, 3)
            plt.plot(_np(Xtrain), _np(Z))
            plt.ylim(-1.1, 1.1)
            plt.xlabel('$x$')
            plt.ylabel('Hidden Unit Outputs ($z$)');

            ipd.clear_output(wait=True)
            ipd.display(fig)

    if Xtrain.is_cuda:
        # CUDA kernels are queued asynchronously; wait for them so the elapsed
        # time covers the actual computation.
        torch.cuda.synchronize()
    endTime = time.time()

    if graphics:
        ipd.clear_output(wait=True)

    return errorTrace, endTime - startTime

スポンサーリンク

## All three, with bigger nets

# Time the NumPy, torch-CPU and torch-GPU trainers over a grid of step counts
# and hidden-layer sizes, collecting one timing row per combination.
results = []
for nReps in [1000, 5000, 100000]:
    for nH in [10, 100, 1000, 10000]:
        # Identical hyper-parameters for all three implementations.
        settings = dict(nHiddens=nH, rhoh=0.1, rhoo=0.0001, nReps=nReps)

        errors_numpy, seconds_numpy = nn_numpy(Xtrain, Ttrain, Xtest, Ttest, **settings)
        errors_torch, seconds_torch = nn_torch(Xtrain_torch, Ttrain_torch, Xtest_torch, Ttest_torch, **settings)
        errors_gpu, seconds_gpu = nn_torch(Xtrain_gpu, Ttrain_gpu, Xtest_gpu, Ttest_gpu, **settings)

        timings = [seconds_numpy, seconds_torch, seconds_gpu]
        # Final testing RMSE (last row, column 1) of each run.
        final_test_errors = [errors_numpy[-1, 1], errors_torch[-1, 1], errors_gpu[-1, 1]]

        report = 'nHidden {:5d} nReps {:7d}, numpy {:8.2f}, torch {:8.2f}, gpu {:8.2f}  errors {:.2f} {:.2f} {:.2f}'
        print(report.format(nH, nReps, *timings, *final_test_errors))
        results.append([nH, nReps, *timings])

nHidden    10 nReps    1000, numpy     0.10, torch     0.45, gpu     1.07  errors 0.40 0.44 0.44
nHidden   100 nReps    1000, numpy     0.14, torch     0.57, gpu     1.07  errors 0.36 0.36 0.36
nHidden  1000 nReps    1000, numpy     1.75, torch     0.97, gpu     1.06  errors 0.32 0.34 0.32
nHidden 10000 nReps    1000, numpy    12.64, torch     3.84, gpu     1.07  errors 1.79 0.40 0.43
nHidden    10 nReps    5000, numpy     0.41, torch     2.37, gpu     5.26  errors 0.36 0.36 0.36
nHidden   100 nReps    5000, numpy     0.69, torch     2.51, gpu     5.24  errors 0.36 0.36 0.36
nHidden  1000 nReps    5000, numpy     9.31, torch     4.58, gpu     5.40  errors 0.30 0.30 0.29
nHidden 10000 nReps    5000, numpy    87.69, torch    17.12, gpu     5.33  errors 0.48 1.00 0.32
nHidden    10 nReps  100000, numpy     7.87, torch    49.92, gpu   107.32  errors 0.31 0.32 0.31
nHidden   100 nReps  100000, numpy    13.81, torch    50.71, gpu   106.52  errors 0.30 0.31 0.30
nHidden  1000 nReps  100000, numpy   198.23, torch    83.84, gpu   105.46  errors 0.28 0.28 0.27
nHidden 10000 nReps  100000, numpy  1512.25, torch   419.72, gpu   107.23  errors 0.68 1.18 0.97

# Plot run time against hidden-layer size for the nReps == 100000 runs.
results = np.array(results)

legends = []
nH = results[:4, 0:1]  # hidden-unit counts taken from the first four rows
backends = ['np', 'torch', 'gpu']

# NOTE(review): indentation was lost in this file; the extent of the disabled
# branch is inferred from `legends` being initialised to [] above — confirm.
if False:
    for count, prefix in ((1000, 'nReps 1000 '), (5000, 'nReps 5000 ')):
        rows = results[:, 1] == count
        plt.semilogx(nH, results[rows, 2:])
        legends += [prefix + s for s in backends]

rows = results[:, 1] == 100000
plt.semilogx(nH, results[rows, 2:], 'o-')
legends.extend('nReps 100000 ' + s for s in backends)

plt.ylabel('Seconds')
plt.xlabel('Number of Hidden Units')
plt.legend(legends);


スポンサーリンク
スポンサーリンク

フォローする