ディープラーニング: フレームワークDeZeroで画像認識をする方法① - ビジネスパーソン・ガジェット置場　empty lot for business

初めてのディープラーニング。この記事はディープラーニングで画像認識をする方法①としてディープラーニングのフレームワークDeZeroを使用する際のデータセットの作成についてまとめます。

DeZeroとは

斎藤康毅さんによる「ゼロから作るDeepLearning③ フレームワーク編」でゼロから作れるフレームワークです。斎藤さんが作成したものはPyPIに登録してありpipでインストールし使用することができます。Define-by-Runの方式が取られており計算グラフが自動で作成され、自動微分によって複雑なコードを書くことなく学習を行うことができます。

今回の画像認識のテーマ

今回はディープラーニングで画像認識をさせる方法①として、このDeZeroを使用してK-POPのIveのメンバーが画像から認識できるように学習を行ってみます。

環境

画像を扱う学習で時間を要してしまうため、今回はGoogle ColaboratoryのGPUで実行します。ちなみに、CPUで実行したところ（してしまったところ）、学習に4時間半かかりましたが、GPUであれば10分程度で完了しました。

実行環境：GoogleColaboratory（ランタイムのタイプ：GPU）

初回画像枚数：学習用（70枚x6人分=420枚）　テスト用（20枚x6人分=120枚）

モデル：VGG16

参照コード

今回は下記のコードを参照させていただき一部変更して実行しています。

DeZeroによるVGG16のファインチューニング - Qiita

実行結果

エポック数は10としています。

学習結果は損失率が0.001、正解率が1.0であるのに対して、テストの結果は損失率が2.014、正解率が0.4666なので、完全に過学習となっています。

図示すると損失率は下記のようになりました。

学習データは順調に下がっていますがテストデータは横ばいです。

正解率は下記のようになりました。

こちらも学習データは順調に１になっているのですがテストデータは上がりません。

ですので、テスト結果の向上を次回以降図っていきます。

ちなみにこちらの女の子が誰なのかを今回生成したモデルで確認してみます。（答えはウォニョン）

こちらが今回のモデルが導き出した答えです。

正解！！

次はこちらの画像で誰なのかを確認してみます。（答えはリズ）

こちらが導き出した答え。

不正解。。。

どうも、Iveのメンバーであるウォニョンとユジンは認識精度が高いのですが、他のメンバーは認識精度が低い結果になっているようです。（ちなみに僕はレイのファンです）

フレームワークDeZeroで画像認識をする方法シリーズとして、過学習が起きている状況を改善して正解率が9割を超えられるようにデータ等を改善していきます。

今回の学習用コード

DeZeroは、!pip install dezeroでインストールします。

前述の参照コードを使用させていただいております。

from google.colab import drive
# Mount Google Drive at /content/drive; the data and image paths used below
# are given relative to a "drive" directory.
drive.mount('/content/drive')

import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import dezero
import dezero.functions as F
import dezero.layers as L
from dezero import DataLoader, optimizers, Parameter, test_mode, Dataset
from dezero.models import VGG16


path = "./drive/MyDrive/Colab Notebooks/data/"

# Sub-directory name of every member. The position in this tuple is the
# class id used as the label (0 = イソ, ..., 5 = レイ).
member_dirs = ('イソ', 'ウォニョン', 'ガウル', 'ユジン', 'リズ', 'レイ')


def list_member_files(root, split):
    """Return one list of file paths per member for ``split``.

    Args:
        root: Base data directory, including the trailing slash.
        split: Sub-directory of ``root`` to scan ('train' or 'test').

    Returns:
        A list with one entry per name in ``member_dirs`` (same order); each
        entry is the sorted list of paths matching ``<root><split>/<member>/*``.
    """
    # glob returns files in arbitrary order; sorting keeps the sample order
    # reproducible between runs.
    return [sorted(glob.glob(root + split + '/' + member + '/*'))
            for member in member_dirs]


def build_label_list(groups):
    """Return class id ``i`` repeated ``len(groups[i])`` times, as ints."""
    return [class_id for class_id, files in enumerate(groups) for _ in files]


_train_groups = list_member_files(path, 'train')
_test_groups = list_member_files(path, 'test')

# Per-member lists are kept under their original names.
(train_path_iso, train_path_wonyon, train_path_gaul,
 train_path_yujin, train_path_lizu, train_path_rey) = _train_groups
(test_path_iso, test_path_wonyon, test_path_gaul,
 test_path_yujin, test_path_lizu, test_path_rey) = _test_groups

# Flat path lists, ordered member by member so they line up with the labels.
train_path = [p for group in _train_groups for p in group]
test_path = [p for group in _test_groups for p in group]

# Labels are integer class ids for both splits (the test labels previously
# started with a float 0., which turned the whole array into floats).
_train_label = build_label_list(_train_groups)
train_label = np.array(_train_label, dtype=int)

_test_label = build_label_list(_test_groups)
test_label = np.array(_test_label, dtype=int)

# Adapted from the referenced code: this version inherits from DeZero's
# Dataset class.
class IveDataset(Dataset):
    """Image dataset that loads member photos lazily from file paths.

    The file paths and labels come from the module-level ``train_path`` /
    ``train_label`` (``train=True``) or ``test_path`` / ``test_label``
    (``train=False``).

    Args:
        train: Select the training split when true, the test split otherwise.
        transform: Callable applied to the opened PIL image. Defaults to
            ``VGG16.preprocess``.
        target_transform: Optional callable applied to the integer label.
    """

    def __init__(self, train=True, transform=VGG16.preprocess, target_transform=None):
        super().__init__(train, transform, target_transform)

    # Called during initialisation.
    def prepare(self):
        """Bind ``self.data`` / ``self.label`` to the selected split."""
        if self.train:
            self.data = train_path
            self.label = train_label
        else:
            self.data = test_path
            self.label = test_label

    def __getitem__(self, index):
        """Return ``(transformed image, label)`` for the sample at ``index``."""
        img = Image.open(self.data[index])
        img = self.transform(img)
        # Cast to a plain int first, then honour target_transform, which the
        # constructor accepts but which used to be silently ignored here.
        # NOTE(review): relies on the base class storing a callable in
        # self.target_transform even when None is passed - confirm.
        label = int(self.label[index])
        return img, self.target_transform(label)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.label)

# Mini-batch size shared by the training and test loaders.
batchsize = 8

# Build both datasets first, then wrap them in loaders. Only the training
# loader shuffles its samples.
train_set = IveDataset(train=True)
test_set = IveDataset(train=False)

train_loader = DataLoader(train_set, batchsize, shuffle=True)
test_loader = DataLoader(test_set, batchsize, shuffle=False)

# Don't forget this: training takes much longer without it.
for _loader in (train_loader, test_loader):
    _loader.to_gpu()


class FreezeParam:
    """Optimizer hook that zeroes the gradients of selected parameters.

    Instances are meant to be registered with ``optimizer.add_hook``; each
    call resets the gradient of every collected parameter to zero.

    Args:
        *layers: ``Parameter`` objects, or objects exposing ``params()``
            (all parameters yielded by ``params()`` are collected).
    """

    def __init__(self, *layers):
        self.freeze_params = []
        for target in layers:
            if isinstance(target, Parameter):
                self.freeze_params.append(target)
            else:
                self.freeze_params.extend(target.params())

    def __call__(self, params):
        """Zero the gradients of the frozen parameters.

        Args:
            params: Parameters handed over by the optimizer; unused, because
                the hook works on the list collected at construction time.
        """
        for p in self.freeze_params:
            grad = p.grad
            if grad is None:
                # No gradient was computed for this parameter (for example it
                # did not take part in the forward pass): nothing to reset.
                continue
            # Replace the gradient with a zero array of the same shape and
            # dtype instead of the Python int 0, so code that later treats
            # grad.data as an array keeps working. copy() + fill() stays on
            # the array's own type and yields a writable buffer.
            zeroed = grad.data.copy()
            zeroed.fill(0)
            grad.data = zeroed


# Start from VGG16 built with pretrained=True and replace the final layer
# with a new 6-output linear layer (one output per member).
model = VGG16(pretrained=True)
model.fc8 = L.Linear(6)

# Convolution layers whose gradients are zeroed by the hook below:
# conv1_1, conv1_2, conv2_1, conv2_2, conv3_1..conv3_3, conv4_1..conv4_3,
# conv5_1..conv5_3 (the tuple holds the number of layers in each stage).
freeze_layers = [
    getattr(model, 'conv{}_{}'.format(stage, position))
    for stage, depth in enumerate((2, 2, 3, 3, 3), start=1)
    for position in range(1, depth + 1)
]
freeze_fn = FreezeParam(*freeze_layers)

optimizer = optimizers.AdaGrad(lr=0.001).setup(model)
optimizer.add_hook(freeze_fn)

# Don't forget this: training takes much longer without it.
model.to_gpu()



def test_score(model, test_loader, test_set):
    """Evaluate ``model`` on the test data.

    Args:
        model: Callable model; ``model(x)`` returns the class scores.
        test_loader: Iterable yielding ``(x, y)`` mini-batches.
        test_set: Dataset behind ``test_loader``; only its length is used,
            as the denominator of the averages.

    Returns:
        Tuple ``(mean loss, mean accuracy)`` where each batch is weighted by
        its size.
    """
    tmp_loss, tmp_acc = 0.0, 0.0

    for x, y in test_loader:
        # test_mode() only switches the train flag; no_grad() additionally
        # stops DeZero from recording a computational graph, which is never
        # needed for evaluation and only costs memory.
        with test_mode(), dezero.no_grad():
            y_pred = model(x)
            loss = F.softmax_cross_entropy(y_pred, y)
            acc = F.accuracy(y_pred, y)
        tmp_loss += float(loss.data) * len(y)
        tmp_acc += float(acc.data) * len(y)

    return tmp_loss / len(test_set), tmp_acc / len(test_set)

def train_net(model, train_loader, test_loder, max_epoch=1):
    """Train ``model`` and evaluate it on the test data after every epoch.

    The module-level ``optimizer`` performs the parameter updates.

    Args:
        model: Model to train; called as ``model(x)``.
        train_loader: Iterable yielding ``(x, y)`` training mini-batches.
        test_loder: Iterable yielding ``(x, y)`` test mini-batches. (The
            misspelled name is kept so existing keyword callers still work.)
        max_epoch: Number of passes over ``train_loader``.

    Returns:
        ``(model, [train_losses, test_losses, train_accuracies,
        test_accuracies])`` with one list entry per epoch.
    """
    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []

    for epoch in range(max_epoch):
        tmp_loss, tmp_acc, n_seen = 0.0, 0.0, 0
        for x, y in train_loader:
            y_pred = model(x)
            loss = F.softmax_cross_entropy(y_pred, y)

            model.cleargrads()
            loss.backward()
            optimizer.update()

            tmp_loss += float(loss.data) * len(y)
            tmp_acc += float(F.accuracy(y_pred, y).data) * len(y)
            n_seen += len(y)

        # Average over the samples actually iterated, so the result is right
        # for whatever loader was passed in (not only for the global
        # train_set). max() avoids a division by zero on an empty loader.
        denom = max(n_seen, 1)
        train_loss = tmp_loss / denom
        train_acc = tmp_acc / denom
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        # Evaluate with the loader given as argument; it used to be ignored
        # in favour of the global test_loader. Fall back to the global
        # test_set when the loader does not expose its dataset.
        eval_set = test_loder.dataset if hasattr(test_loder, 'dataset') else test_set
        test_loss, test_acc = test_score(model, test_loder, eval_set)
        test_losses.append(test_loss)
        test_accuracies.append(test_acc)
        print("学習結果：損失率: ", train_loss, "正解率: ", train_acc)
        print("テスト結果：損失率: ", test_loss, "正解率: ", test_acc)

    return model, [train_losses, test_losses, train_accuracies, test_accuracies]

# Number of training epochs for this run.
max_epoch = 10
# `progress` receives the per-epoch history lists returned by train_net.
model, progress = train_net(
    model, train_loader, test_loader, max_epoch=max_epoch
)

画像認識を確認するコード

import dezero
from PIL import Image

# Load the image to classify.
img = Image.open('./drive/MyDrive/Colab Notebooks/liz.jpeg')
img

# Preprocess the image and add a leading batch axis.
x = VGG16.preprocess(img)[np.newaxis]
model.to_cpu()

with dezero.test_mode():
    y = model(x)
predict_id = np.argmax(y.data)

# Class id -> member name, in the same order as the training labels.
member_names = ('イソ', 'ウォニョン', 'ガウル', 'ユジン', 'リズ', 'レイ')
if 0 <= predict_id < len(member_names):
    print(member_names[predict_id])