PyTorchによる画像分類(alexnet編)

今回はこのサイトのPyTorch tutorialを実践する。このチュートリアルは、モデルとしてalexnetを使用して画像分類を行っている。

スポンサーリンク

Automatic differentiation in pytorch

先ずは、いくつかの層をpytorchを使って実装していく。

import torch
from torch import nn
from torch.nn.parameter import Parameter
from torch.autograd import Variable

# Let's define a linear layer.
class nn_Linear(nn.Module):
    """Hand-written fully connected layer computing ``y = x @ W.T + b``.

    Args:
        input_dim: Number of features in each input row.
        output_dim: Number of features in each output row.
    """

    def __init__(self, input_dim, output_dim):
        super(nn_Linear, self).__init__()
        # Wrapping tensors in Parameter registers them with the module, so
        # they are returned by self.parameters().
        self.weight = Parameter(torch.Tensor(output_dim, input_dim))
        self.bias = Parameter(torch.Tensor(1, output_dim))

        # Initialize weight and bias in place with small uniform random
        # values. Parameters expose their values as .data and their
        # gradients as .grad.
        self.weight.data.uniform_(-0.001, 0.001)
        self.bias.data.uniform_(-0.001, 0.001)

    def forward(self, x):
        """Apply the layer to a batch.

        Args:
            x: Tensor of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) holding ``x @ W.T + b``.
        """
        # Operations run eagerly on real values, so sizes can be inspected
        # here while debugging, e.g. print(self.weight.size(), x.size()).

        # Repeat the (1, output_dim) bias row once per batch element.
        batch_expanded_bias = self.bias.expand(x.size(0), self.bias.size(1))
        # addmm(input, mat1, mat2) computes input + mat1 @ mat2. This avoids
        # the deprecated positional (beta, input, alpha, mat1, mat2) form and
        # the transposes it required.
        return torch.addmm(batch_expanded_bias, x, self.weight.t())
    

# Instantiate the custom layer: 4 input features -> 2 output features.
linear = nn_Linear(4, 2)

# A batch of three 4-dimensional input rows.
inputVar = Variable(torch.Tensor([
    [0.2, 0.3, -0.1, 0.2],
    [0.3, 0.1, 0.3, -0.4],
    [0.1, 0.2, 0.4, -0.4],
]))

# Run the forward pass and inspect the result.
outputVar = linear(inputVar)
print(outputVar.data)   # y = Wx + b for every row of the batch.
print(outputVar.grad)   # No backward pass has been run here; the sample output shows None.

# Modules keep track of their registered parameters, so traversing them is easy.
print([p.size() for p in linear.parameters()])
tensor([[0.0005, 0.0007],
        [0.0001, 0.0012],
        [0.0001, 0.0011]])
None
[torch.Size([2, 4]), torch.Size([1, 2])]

上のコードについて以下のことに留意する。

  • Parameterは特別な変数で、テンソルをParameterでラップするとモジュールのパラメータとして登録され、module.parameters()を呼び出した際に返されるようになる。これにより、以下のように全パラメータを順に反復するだけでSGDステップなどの最適化処理を実行できる。

    param.data.add_(-0.001 * param.grad.data)
  • 上のラインは、2つ目の留意点につながる。変数(とパラメータ)は、実際の値(.data)と勾配(.grad)の2つの値を持つ。これにより、入力・出力・パラメータなど、モデル内で変数として扱うあらゆるものについて勾配を求めることができる。

単一のダミーバッチに対する平均二乗誤差(MSE)損失関数を最小化するように、SGDを使って線形層のパラメータを更新する例を下記に示す(なお、下記のnn_MSECriterionは平均ではなく二乗誤差の総和を返す)。

class nn_MSECriterion(nn.Module):  # Named MSE (mean squared error), but forward() returns the summed squared error; no mean is taken.
    def forward(self, predictions, labels):
        return (predictions - labels).pow(2).sum()
    
inputs = Variable(torch.Tensor([[0.2, 0.3, -0.1, 0.2],
                               [0.3, 0.1, 0.3, -0.4],
                               [0.1, 0.2, 0.4, -0.4]]))

labels = Variable(torch.Tensor([[1, 1],
                                [2, 2],
                                [3, 3]]))

# Fit a fresh linear layer to the single dummy batch above with hand-written SGD updates.
linear = nn_Linear(4, 2)
linear.train()  # Training mode; only matters for modules containing dropout or batchnorm, which behave differently at test time.
for iteration in range(0, 50):
    predictions = linear(inputs) # Forward pass.
    loss = nn_MSECriterion()(predictions, labels)  # Summed squared error over the batch.
    loss.backward() # Backpropagate. NOTE(review): .grad is never zeroed, so gradients accumulate across iterations -- confirm this is intended.
    linear.weight.data.add_(-0.0001 * linear.weight.grad.data)  # SGD step on the weights (learning rate 0.0001).
    linear.bias.data.add_(-0.0001 * linear.bias.grad.data)  # SGD step on the bias (learning rate 0.0001).
    print(iteration, loss.data[0])  # Indexing a 0-dim tensor triggers the UserWarning shown after the output.
0 tensor(27.9894)
1 tensor(27.9537)
2 tensor(27.8826)
3 tensor(27.7762)
4 tensor(27.6347)
5 tensor(27.4587)
6 tensor(27.2484)
7 tensor(27.0047)
8 tensor(26.7280)
9 tensor(26.4193)
10 tensor(26.0793)
11 tensor(25.7090)
12 tensor(25.3094)
13 tensor(24.8817)
14 tensor(24.4270)
15 tensor(23.9466)
16 tensor(23.4418)
17 tensor(22.9140)
18 tensor(22.3648)
19 tensor(21.7955)
20 tensor(21.2078)
21 tensor(20.6034)
22 tensor(19.9839)
23 tensor(19.3509)
24 tensor(18.7064)
25 tensor(18.0521)
26 tensor(17.3897)
27 tensor(16.7211)
28 tensor(16.0482)
29 tensor(15.3729)
30 tensor(14.6969)
31 tensor(14.0222)
32 tensor(13.3507)
33 tensor(12.6842)
34 tensor(12.0245)
35 tensor(11.3735)
36 tensor(10.7330)
37 tensor(10.1047)
38 tensor(9.4904)
39 tensor(8.8917)
40 tensor(8.3104)
41 tensor(7.7480)
42 tensor(7.2061)
43 tensor(6.6861)
44 tensor(6.1894)
45 tensor(5.7176)
46 tensor(5.2718)
47 tensor(4.8532)
48 tensor(4.4630)
49 tensor(4.1023)
/root/.pyenv/versions/py365/lib/python3.6/site-packages/ipykernel_launcher.py:22: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number

最後の行のloss.data[0]を警告文の指摘通りloss.item()に書き換える。

class nn_MSECriterion(nn.Module):
    """Squared-error criterion.

    Despite the "MSE" (mean squared error) name, the squared differences are
    summed over all elements rather than averaged.
    """

    def forward(self, predictions, labels):
        """Return the sum of squared differences between predictions and labels."""
        residual = predictions - labels
        return residual.pow(2).sum()
    
# Dummy batch: three 4-dimensional inputs with a 2-dimensional target each.
inputs = Variable(torch.Tensor([
    [0.2, 0.3, -0.1, 0.2],
    [0.3, 0.1, 0.3, -0.4],
    [0.1, 0.2, 0.4, -0.4],
]))

labels = Variable(torch.Tensor([
    [1, 1],
    [2, 2],
    [3, 3],
]))

# Fit a fresh linear layer to the dummy batch with hand-written SGD updates.
linear = nn_Linear(4, 2)
# Training mode; only matters for modules containing dropout or batchnorm,
# which behave differently at test time.
linear.train()
for iteration in range(50):
    # Forward pass, then the summed squared error over the batch.
    predictions = linear(inputs)
    loss = nn_MSECriterion()(predictions, labels)

    # Backpropagate the loss to the layer parameters.
    # NOTE(review): .grad is never zeroed, so gradients accumulate across
    # iterations -- confirm this is intended before reusing this loop.
    loss.backward()

    # SGD step with learning rate 0.0001 on weight and bias.
    linear.weight.data.add_(-0.0001 * linear.weight.grad.data)
    linear.bias.data.add_(-0.0001 * linear.bias.grad.data)

    # .item() converts the 0-dim loss tensor to a Python number.
    print(iteration, loss.item())
0 28.009300231933594
1 27.97365951538086
2 27.902475357055664
3 27.79595184326172
4 27.65437889099121
5 27.4781494140625
6 27.267757415771484
7 27.023786544799805
8 26.74691390991211
9 26.437908172607422
10 26.097627639770508
11 25.727022171020508
12 25.32712173461914
13 24.899028778076172
14 24.443939208984375
15 23.963119506835938
16 23.457901000976562
17 22.9296875
18 22.379945755004883
19 21.810203552246094
20 21.222042083740234
21 20.61709213256836
22 19.99703598022461
23 19.36359214782715
24 18.718517303466797
25 18.0636043548584
26 17.400665283203125
27 16.731542587280273
28 16.058086395263672
29 15.382162094116211
30 14.705648422241211
31 14.030411720275879
32 13.358325004577637
33 12.691247940063477
34 12.031024932861328
35 11.379481315612793
36 10.738417625427246
37 10.10960578918457
38 9.49477767944336
39 8.895630836486816
40 8.313817977905273
41 7.75093936920166
42 7.208544731140137
43 6.688124656677246
44 6.191106796264648
45 5.718854904174805
46 5.272660732269287
47 4.853744029998779
48 4.4632463455200195
49 4.102230072021484

最初に気付くのは、勾配を計算するためのバックワード関数を書く必要がないということである。PyTorchが提供する演算だけを使って実装する限り、バックワードパスは自動的に得られる。また、PyTorchは基本的な層(といくつかの複雑な層)をtorch.nn内(例えば、nn.Sequential, nn.Linear, nn.Conv2d, nn.ReLU, nn.Sigmoid)に、さらにtorch.nn.functional内(例えば、F.relu, F.sigmoid等。層がパラメータを持たない場合に便利な関数形式)にも既に用意しているので、nn_Linearのような基本的な層を自分で実装する必要はない。とはいえ、独自のモジュールを作成し、torchの演算を使ってforward関数を記述すれば、新しい関数を実装できる。

Convolutional Neural Networks in pytorch

Pytorchは畳み込み層を実装していて、簡単に事前訓練されたモデル(VGG, Resnetなど)にアクセスできる。故に、Kerasやlua-torch並に便利だと言えよう。

import torchvision.models as models
# Build torchvision's AlexNet with pretrained weights (the log below shows them being downloaded).
alexnet = models.alexnet(pretrained = True)
# Printing the module lists its layers, grouped under `features` and `classifier`.
print(alexnet)
Downloading: "https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth" to /root/.torch/models/alexnet-owt-4df8aa71.pth
100%|██████████| 244418560/244418560 [00:35<00:00, 6905059.98it/s] 
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.5)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace)
    (3): Dropout(p=0.5)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)

推論テスト用の画像を下記のサイトからダウンロードする。

%download https://github.com/facebookresearch/deepmask/raw/master/data/testImage.jpg -f test_image.jpg
Downloaded 'test_image.jpg'.

次に、サンプル画像を使って推論結果を表示し、モデルを試す。alexnet.features(image)を使用すれば、このモデルの特徴抽出部分だけを直接利用できることに留意する。この場合、alexnet(image)が分類パートの最後の線形層の出力を返すのとは対照的に、最後のMaxPool2d層の活性が出力される。

import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import json, string
%matplotlib inline

# 1. Image pre-processing: scale to 256, center-crop to 224, convert to a tensor, normalize per channel (transforms.Scale is deprecated in favor of transforms.Resize).
preprocessFn = transforms.Compose([transforms.Scale(256), 
                                   transforms.CenterCrop(224), 
                                   transforms.ToTensor(), 
                                   transforms.Normalize(mean = [0.485, 0.456, 0.406], 
                                                        std=[0.229, 0.224, 0.225])])

# 2. Load the imagenet class names, keyed by integer class index. This relative path is not found in this environment (see the FileNotFoundError below).
imagenetClasses = {int(idx): entry[1] for (idx, entry) in json.load(open('imagenet_class_index.json')).items()}

# 3. Forward the downloaded test image through the network.
# Always switch to evaluation mode first so Dropout layers don't add randomness.
alexnet.eval()
# unsqueeze(0) (used below) adds the dummy batch dimension the model expects.
image = Image.open('test_image.jpg').convert('RGB')
inputVar =  Variable(preprocessFn(image).unsqueeze(0))
predictions = alexnet(inputVar)

# 4. Decode the top 10 classes predicted for this image.
# Softmax is needed because the model outputs the last linear layer's activations; sorting the negated scores yields descending order.
probs, indices = (-nn.Softmax()(predictions).data).sort()
probs = (-probs).numpy()[0][:10]; indices = indices.numpy()[0][:10]
preds = [imagenetClasses[idx] + ': ' + str(prob) for (prob, idx) in zip(probs, indices)]

# 5. Show the image with the predictions as its title.
plt.title(string.join(preds, '\n'))
plt.imshow(image);
/root/.pyenv/versions/py365/lib/python3.6/site-packages/torchvision/transforms/transforms.py:188: UserWarning: The use of the transforms.Scale transform is deprecated, please use transforms.Resize instead.
  "please use transforms.Resize instead.")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-6-96195316e90c> in <module>()
     13 
     14 # 2. Load the imagenet class names.
---> 15 imagenetClasses = {int(idx): entry[1] for (idx, entry) in json.load(open('imagenet_class_index.json')).items()}
     16 
     17 # 3. Forward a test image of the toaster.

FileNotFoundError: [Errno 2] No such file or directory: 'imagenet_class_index.json'

imagenet_class_index.jsonの場所を探して教えてあげる。

!find / -name imagenet_class_index.json
/root/.keras/models/imagenet_class_index.json
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import json, string
%matplotlib inline

# 1. Image pre-processing: scale to 256, center-crop to 224, convert to a tensor, normalize per channel (transforms.Scale is deprecated in favor of transforms.Resize).
preprocessFn = transforms.Compose([transforms.Scale(256), 
                                   transforms.CenterCrop(224), 
                                   transforms.ToTensor(), 
                                   transforms.Normalize(mean = [0.485, 0.456, 0.406], 
                                                        std=[0.229, 0.224, 0.225])])

# 2. Load the imagenet class names, keyed by integer class index, from the path located with `find`.
imagenetClasses = {int(idx): entry[1] for (idx, entry) in json.load(open('/root/.keras/models/imagenet_class_index.json')).items()}

# 3. Forward the downloaded test image through the network.
# Always switch to evaluation mode first so Dropout layers don't add randomness.
alexnet.eval()
# unsqueeze(0) (used below) adds the dummy batch dimension the model expects.
image = Image.open('test_image.jpg').convert('RGB')
inputVar =  Variable(preprocessFn(image).unsqueeze(0))
predictions = alexnet(inputVar)

# 4. Decode the top 10 classes predicted for this image.
# Softmax is needed because the model outputs the last linear layer's activations; sorting the negated scores yields descending order.
probs, indices = (-nn.Softmax()(predictions).data).sort()
probs = (-probs).numpy()[0][:10]; indices = indices.numpy()[0][:10]
preds = [imagenetClasses[idx] + ': ' + str(prob) for (prob, idx) in zip(probs, indices)]

# 5. Show the image with the predictions as its title. string.join is unavailable here, hence the AttributeError shown below.
plt.title(string.join(preds, '\n'))
plt.imshow(image);
/root/.pyenv/versions/py365/lib/python3.6/site-packages/torchvision/transforms/transforms.py:188: UserWarning: The use of the transforms.Scale transform is deprecated, please use transforms.Resize instead.
  "please use transforms.Resize instead.")
/root/.pyenv/versions/py365/lib/python3.6/site-packages/ipykernel_launcher.py:27: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-8-c02f25ed388b> in <module>()
     30 
     31 # 5. Show image and predictions
---> 32 plt.title(string.join(preds, '\n'))
     33 plt.imshow(image);

AttributeError: module 'string' has no attribute 'join'

string.join(preds, '\n')の部分を、文字列のjoinメソッドを使って"\n ".join(preds)と書き換える。

import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import json, string
%matplotlib inline

# 1. Define the image pre-processing pipeline: resize to 256, center-crop to
#    224, convert to a tensor and normalize each channel.
#    transforms.Resize replaces the deprecated transforms.Scale, as the
#    deprecation warning recommends.
preprocessFn = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# 2. Load the imagenet class names, keyed by integer class index.
#    The context manager closes the file handle once the JSON is parsed.
with open('/root/.keras/models/imagenet_class_index.json') as class_index_file:
    imagenetClasses = {int(idx): entry[1]
                       for (idx, entry) in json.load(class_index_file).items()}

# 3. Forward the downloaded test image through the network.
# Always switch to evaluation mode first so Dropout layers don't add randomness.
alexnet.eval()
image = Image.open('test_image.jpg').convert('RGB')
# unsqueeze(0) adds the dummy batch dimension the model expects.
inputVar = Variable(preprocessFn(image).unsqueeze(0))
predictions = alexnet(inputVar)

# 4. Decode the top 10 classes predicted for this image.
# Softmax is needed because the model outputs the last linear layer's
# activations, not probabilities. dim=1 normalizes over the class axis
# explicitly (the implicit choice is deprecated), and a descending sort
# replaces the negate-sort-negate trick.
probs, indices = nn.Softmax(dim=1)(predictions).data.sort(descending=True)
probs = probs.numpy()[0][:10]
indices = indices.numpy()[0][:10]
preds = [imagenetClasses[idx] + ': ' + str(prob) for (prob, idx) in zip(probs, indices)]

# 5. Show the image with the predictions as its title, using a larger
#    figure and font.
plt.rcParams['figure.figsize'] = 20, 20
plt.rcParams["font.size"] = "20"
plt.title("\n ".join(preds))
plt.imshow(image);
/root/.pyenv/versions/py365/lib/python3.6/site-packages/torchvision/transforms/transforms.py:188: UserWarning: The use of the transforms.Scale transform is deprecated, please use transforms.Resize instead.
  "please use transforms.Resize instead.")
/root/.pyenv/versions/py365/lib/python3.6/site-packages/ipykernel_launcher.py:27: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.