Increasing the hidden layers of a neural network to improve accuracy on the Rastrigin function [C++ ArrayFire]
This is 六花.
Now that the neural network layers have been abstracted, it has become easy to make the network deeper.
So this time I will change the function to approximate from the Sphere function to the more complex Rastrigin function and look at how the results change with the depth of the network.
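For reference, the n-dimensional Rastrigin function used here (the same form that appears in the code below) is

f(\mathbf{x}) = 10n + \sum_{i=1}^{n} \left( x_i^2 - 10 \cos(2 \pi x_i) \right), \quad x_i \in [-5.12, 5.12]

and, unlike the Sphere function, it has a large number of local minima around the global minimum at the origin, which makes it a noticeably harder target to approximate.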
§Environment
Windows 10
Microsoft Visual Studio Community 2022 (64 ビット) Version 17.3.4
ArrayFire v3.8.2
constexpr int size_hidden_layer = 1;
Since I want to build the model with a for loop, I declare a variable (the number of hidden layers) for that purpose.
// Build the model
std::vector<af::array> data; // values passed between layers
data.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
{
data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
}
data.push_back(af::constant(0.0, size_output, size_batch, dtype_t));
std::vector<af::array> grad; // errors (gradients)
grad.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
{
grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
}
grad.push_back(af::constant(0.0, size_output, size_batch, dtype_t));
std::vector<std::shared_ptr<Layer>> layer; // processing layers
layer.push_back(std::make_shared<FC_layer>(data.at(0), data.at(1), grad.at(0), grad.at(1)));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer * 2; i_hidden_layer += 2)
{
layer.push_back(std::make_shared<tanhExp_layer>(data.at(i_hidden_layer + 1), data.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 1), grad.at(i_hidden_layer + 2)));
layer.push_back(std::make_shared<FC_layer>(data.at(i_hidden_layer + 2), data.at(i_hidden_layer + 3), grad.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 3)));
}
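As a reading aid for the loop above, here is a rough sketch (not part of the original code) of the chain it builds when size_hidden_layer = 1:

// data[0] (size_input x size_batch) : input
//   -- FC_layer      --> data[1] (size_hidden x size_batch)
//   -- tanhExp_layer --> data[2] (size_hidden x size_batch)
//   -- FC_layer      --> data[3] (size_output x size_batch) : output
// grad[i] has the same shape as data[i] and holds its gradient.
// Every additional hidden layer appends one more tanhExp_layer + FC_layer pair,
// so layer.size() == 1 + 2 * size_hidden_layer.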
The model is built with for loops.
It is very easy to make mistakes here, so I think it is important to test with gradient checking once the construction is done.
(In fact, I made mistakes myself while writing this code.)
The way you write the for loop over layer, in particular, is where personal style shows.
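For reference, the gradient check in the full code below compares grad_test, the gradient obtained by backpropagation for a single input element, against a central-difference approximation of the loss J = (y - t)^2 / 2 for the corresponding output element:

gradapprox = (J(x + eps) - J(x - eps)) / (2 * eps)
difference = |grad_test - gradapprox| / (|grad_test| + |gradapprox|)

If the backpropagation is implemented correctly, difference should come out close to 0 (roughly on the order of eps).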
The results of increasing the hidden layers up to 6 are shown below.
Partway through, the gradient started to diverge, so the learning rate was halved (alpha = 0.0005 from 4 hidden layers onward).
// 1 hidden layer, alpha = 0.001
epoch : 1 diff : 7.37932 norm : 108.335
epoch : 101 diff : 8.29352 norm : 114.789
epoch : 201 diff : 7.1323 norm : 100.212
epoch : 301 diff : 8.35752 norm : 111.828
epoch : 401 diff : 9.42686 norm : 134.095
epoch : 501 diff : 7.58281 norm : 106.214
epoch : 601 diff : 9.0468 norm : 122.5
epoch : 701 diff : 7.90238 norm : 108.289
epoch : 801 diff : 8.02885 norm : 112.663
epoch : 901 diff : 8.30578 norm : 116.401
epoch : 1001 diff : 6.91363 norm : 97.8099
epoch : 1101 diff : 8.07373 norm : 109.409
epoch : 1201 diff : 7.4678 norm : 106.38
epoch : 1301 diff : 6.98367 norm : 94.5784
epoch : 1401 diff : 6.31404 norm : 89.8606
epoch : 1501 diff : 7.02407 norm : 97.0159
epoch : 1601 diff : 7.5026 norm : 103.159
epoch : 1701 diff : 6.66432 norm : 93.4793
epoch : 1801 diff : 6.16135 norm : 89.009
// 2 hidden layers, alpha = 0.001
epoch : 1 diff : 8.90514 norm : 122.009
epoch : 101 diff : 6.84836 norm : 93.566
epoch : 201 diff : 4.69939 norm : 68.5233
epoch : 301 diff : 5.15874 norm : 72.5089
epoch : 401 diff : 1.88894 norm : 30.4353
epoch : 501 diff : 1.41153 norm : 21.0782
epoch : 601 diff : 1.46311 norm : 24.3487
epoch : 701 diff : 1.32166 norm : 19.0509
epoch : 801 diff : 1.32115 norm : 20.1267
epoch : 901 diff : 0.888632 norm : 14.6458
epoch : 1001 diff : 1.1128 norm : 17.5781
epoch : 1101 diff : 0.903911 norm : 13.8627
epoch : 1201 diff : 0.981142 norm : 14.626
epoch : 1301 diff : 0.800413 norm : 12.5818
epoch : 1401 diff : 0.672896 norm : 10.6123
epoch : 1501 diff : 0.679384 norm : 10.8956
epoch : 1601 diff : 0.659158 norm : 10.6668
epoch : 1701 diff : 1.51917 norm : 21.0317
epoch : 1801 diff : 0.547854 norm : 8.07929
// 3 hidden layers, alpha = 0.001
epoch : 1 diff : 7.72143 norm : 108.489
epoch : 101 diff : 1.1357 norm : 17.8866
epoch : 201 diff : 0.467786 norm : 7.54614
epoch : 301 diff : 0.45597 norm : 6.57141
epoch : 401 diff : 0.281907 norm : 4.32094
epoch : 501 diff : 0.487077 norm : 7.00092
epoch : 601 diff : 0.278516 norm : 4.10368
epoch : 701 diff : 0.209454 norm : 3.20719
epoch : 801 diff : 0.233474 norm : 3.60295
epoch : 901 diff : 0.236193 norm : 3.38057
epoch : 1001 diff : 0.189152 norm : 3.14622
epoch : 1101 diff : 0.188179 norm : 2.7994
epoch : 1201 diff : 0.418586 norm : 5.83399
epoch : 1301 diff : 0.194182 norm : 2.76702
epoch : 1401 diff : 0.160596 norm : 2.41861
epoch : 1501 diff : 0.171662 norm : 2.56559
epoch : 1601 diff : 0.160864 norm : 2.38836
epoch : 1701 diff : 0.18597 norm : 2.84894
epoch : 1801 diff : 0.183716 norm : 3.02124
// 4 hidden layers, alpha = 0.0005
epoch : 1 diff : 9.81055 norm : 135.139
epoch : 101 diff : 0.606811 norm : 9.36324
epoch : 201 diff : 0.289493 norm : 4.38984
epoch : 301 diff : 0.482455 norm : 7.27102
epoch : 401 diff : 0.148224 norm : 2.16977
epoch : 501 diff : 0.150126 norm : 2.13533
epoch : 601 diff : 0.11908 norm : 1.76977
epoch : 701 diff : 0.153793 norm : 2.14698
epoch : 801 diff : 0.108161 norm : 1.60318
epoch : 901 diff : 0.173767 norm : 2.83776
epoch : 1001 diff : 0.145149 norm : 2.16293
epoch : 1101 diff : 0.267259 norm : 3.73422
epoch : 1201 diff : 0.125327 norm : 1.8236
epoch : 1301 diff : 0.188406 norm : 2.78781
epoch : 1401 diff : 0.126291 norm : 1.86106
epoch : 1501 diff : 0.380341 norm : 5.96416
epoch : 1601 diff : 0.089635 norm : 1.30651
epoch : 1701 diff : 0.0850607 norm : 1.29814
epoch : 1801 diff : 0.207073 norm : 3.38722
// 5 hidden layers, alpha = 0.0005
epoch : 1 diff : 8.82929 norm : 119.126
epoch : 101 diff : 0.289587 norm : 5.07952
epoch : 201 diff : 0.317917 norm : 5.18149
epoch : 301 diff : 0.144423 norm : 2.07593
epoch : 401 diff : 0.148498 norm : 2.31556
epoch : 501 diff : 0.135578 norm : 2.32463
epoch : 601 diff : 0.169594 norm : 2.405
epoch : 701 diff : 0.101373 norm : 1.51
epoch : 801 diff : 0.0961684 norm : 1.41652
epoch : 901 diff : 0.121638 norm : 1.76756
epoch : 1001 diff : 0.112943 norm : 1.71853
epoch : 1101 diff : 0.066157 norm : 0.967472
epoch : 1201 diff : 0.0773755 norm : 1.14591
epoch : 1301 diff : 0.0572946 norm : 0.83687
epoch : 1401 diff : 0.0538033 norm : 0.761378
epoch : 1501 diff : 0.0622578 norm : 0.891664
epoch : 1601 diff : 0.0577832 norm : 0.854547
epoch : 1701 diff : 0.0629358 norm : 1.00799
epoch : 1801 diff : 0.0524491 norm : 0.786615
// 6 hidden layers, alpha = 0.0005
epoch : 1 diff : 8.48437 norm : 116.941
epoch : 101 diff : 0.265852 norm : 4.21897
epoch : 201 diff : 0.312853 norm : 5.28629
epoch : 301 diff : 0.17947 norm : 3.03704
epoch : 401 diff : 0.118821 norm : 1.96942
epoch : 501 diff : 0.113753 norm : 1.75402
epoch : 601 diff : 0.164634 norm : 3.18647
epoch : 701 diff : 0.066769 norm : 1.07846
epoch : 801 diff : 0.231552 norm : 3.99542
epoch : 901 diff : 0.0594998 norm : 1.0429
epoch : 1001 diff : 0.048821 norm : 0.76639
epoch : 1101 diff : 0.0531493 norm : 0.818055
epoch : 1201 diff : 0.0566023 norm : 1.16451
epoch : 1301 diff : 0.0442795 norm : 0.681057
epoch : 1401 diff : 0.0399107 norm : 0.572991
epoch : 1501 diff : 0.0404391 norm : 0.617015
epoch : 1601 diff : 0.0449341 norm : 0.718099
epoch : 1701 diff : 0.0375647 norm : 0.585123
epoch : 1801 diff : 0.0410178 norm : 0.678535
Although the improvement gradually tapers off, you can see that diff keeps getting smaller.
For the deeper networks, increasing the number of epochs a little more might improve the results further, but I stopped here this time.
Finally, the full source code is given below.
#include <arrayfire.h>
#undef max
#undef min
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include <vector>
#include <memory>
#include <random>
//using var_t = double;
//constexpr auto dtype_t = af::dtype::f64;
//constexpr var_t eps = 1e-7; // about 1e-4 for float, about 1e-7 for double
using var_t = float;
constexpr auto dtype_t = af::dtype::f32;
constexpr var_t eps = 1e-4; // about 1e-4 for float, about 1e-7 for double
struct Layer
{
virtual void init() {} // used to reset the gradients
virtual void forward() = 0; // forward propagation
virtual void backward() = 0; // backpropagation
virtual void SGD() {} // weight update
};
struct FC_layer : public Layer
{
const af::array& x;
af::array& y;
af::array& dx;
const af::array& dy;
const int size_batch;
af::array W;
af::array B;
af::array dW;
af::array dB;
FC_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
: x(x)
, y(y)
, dx(dx)
, dy(dy)
, size_batch(x.dims(1))
{
// dims(n) returns the number of elements along dimension n
// He initialization
W = af::randn(y.dims(0), x.dims(0), 1, dtype_t) * std::sqrt(var_t(2) / var_t(x.dims(0)));
B = af::randn(y.dims(0), 1, 1, dtype_t) * std::sqrt(var_t(2) / var_t(x.dims(0)));
dW = W * 0.0;
dB = B * 0.0;
}
virtual void init()
{
dW = 0.0;
dB = 0.0;
}
virtual void forward()
{
// af::tile copies the array an integer number of times along each dimension
y = af::matmul(W, x) + af::tile(B, 1, size_batch);
y.eval();
}
virtual void backward()
{
dy.eval();
// af::matmulNT computes the product with the second argument transposed
// the gradients of the whole mini-batch are accumulated here, so larger batch sizes mean larger values in dW and dB
// dividing by the batch size is necessary, either now or later
dW += af::matmulNT(dy, x) / (var_t)size_batch;
dB += af::sum(dy, 1) / (var_t)size_batch;
dx += af::matmulTN(W, dy);
dW.eval();
dB.eval();
}
virtual void SGD()
{
//constexpr var_t alpha = (var_t)0.01;
constexpr var_t alpha = (var_t)0.0005;
W -= alpha * dW;
B -= alpha * dB;
W.eval();
B.eval();
}
};
struct ReLU_layer : public Layer
{
const af::array& x;
af::array& y;
af::array& dx;
const af::array& dy;
ReLU_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
: x(x)
, y(y)
, dx(dx)
, dy(dy)
{
}
virtual void forward()
{
y = af::select((x >= 0.0), x, 0.0); // element-wise: x if x >= 0, otherwise 0.0
y.eval();
}
virtual void backward()
{
dy.eval();
dx += af::select((x >= 0.0), dy, 0.0); // element-wise: dy if x >= 0, otherwise 0.0
}
};
struct tanhExp_layer : public Layer
{
const af::array& x;
af::array& y;
af::array& dx;
const af::array& dy;
tanhExp_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
: x(x)
, y(y)
, dx(dx)
, dy(dy)
{
}
inline void forward()
{
y = x * af::tanh(af::exp(x));
y.eval();
}
inline void backward()
{
dy.eval();
const af::array x_exp = af::exp(x);
const af::array tanh_exp = af::tanh(x_exp);
dx += dy * (tanh_exp - x * x_exp * (tanh_exp * tanh_exp - (var_t)1));
}
};
int main()
{
af::info();
constexpr int size_input = 2;
constexpr int size_hidden = 300;
constexpr int size_output = 1;
constexpr int size_batch = 128;
constexpr int size_hidden_layer = 6;
constexpr int size_data = 100000;
constexpr int size_data_train = (int)(size_data * 0.7);
// Rastrigin function
af::array input = af::randu(size_input, size_data, dtype_t) * (var_t)10.24 - (var_t)5.12;
af::array output = (var_t)(10 * size_input) + af::sum(af::pow2(input) - (var_t)10 * af::cos((var_t)2 * af::Pi * input), 0);
// Build the model
std::vector<af::array> data; // values passed between layers
data.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
{
data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
}
data.push_back(af::constant(0.0, size_output, size_batch, dtype_t));
std::vector<af::array> grad; // errors (gradients)
grad.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
{
grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
}
grad.push_back(af::constant(0.0, size_output, size_batch, dtype_t));
std::vector<std::shared_ptr<Layer>> layer; // processing layers
layer.push_back(std::make_shared<FC_layer>(data.at(0), data.at(1), grad.at(0), grad.at(1)));
for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer * 2; i_hidden_layer += 2)
{
layer.push_back(std::make_shared<tanhExp_layer>(data.at(i_hidden_layer + 1), data.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 1), grad.at(i_hidden_layer + 2)));
layer.push_back(std::make_shared<FC_layer>(data.at(i_hidden_layer + 2), data.at(i_hidden_layer + 3), grad.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 3)));
}
// gradient checking
{
// everything up to setting the input data
auto init_phase = [&]()
{
// reset the errors (gradients)
for (auto& itm : grad) { itm = 0.0; }
for (auto& itm : layer) { itm->init(); }
// set the input data
data.front() = input(af::span, af::seq(0, size_batch - 1));
};
// forward and backward pass
auto forback = [&]()
{
// forward pass
for (auto& itm : layer) { itm->forward(); }
// compute the error
grad.back() = data.back() - output(af::span, af::seq(0, size_batch - 1)); // y - t
// backward pass
std::for_each(layer.rbegin(), layer.rend(), [](auto& itm) { itm->backward(); });
};
var_t grad_test = (var_t)123;
{
// everything up to setting the input data
init_phase();
// forward and backward pass
forback();
grad_test = af::sum<var_t>(grad.front()(0, 0));
}
var_t grad_plus = (var_t)123;
{
// everything up to setting the input data
init_phase();
// add a small perturbation
data.front()(0, 0) += eps;
// forward and backward pass
forback();
grad_plus = af::sum<var_t>(af::pow(grad.back(), 2.0)(0, 0) / (var_t)2); // mse
}
var_t grad_minus = (var_t)123;
{
// everything up to setting the input data
init_phase();
// subtract a small perturbation
data.front()(0, 0) -= eps;
// forward and backward pass
forback();
grad_minus = af::sum<var_t>(af::pow(grad.back(), 2.0)(0, 0) / (var_t)2); // mse
}
auto gradapprox = (grad_plus - grad_minus) / (2.0 * eps); // numerical differentiation (central difference)
auto numerator = sqrt(pow(grad_test - gradapprox, 2));
auto norm_grad = sqrt(grad_test * grad_test);
auto norm_gradapprox = sqrt(gradapprox * gradapprox);
auto denominator = norm_grad + norm_gradapprox;
auto difference = numerator / denominator; // the difference to verify
std::cout
<< "--------" << std::endl
<< std::setw(12) << gradapprox << "" << std::endl
<< std::setw(12) << numerator << " = 0" << std::endl
<< std::setw(12) << norm_grad << std::endl
<< std::setw(12) << norm_gradapprox << "" << std::endl
<< std::setw(12) << denominator << std::endl
<< std::setw(12) << difference << " = 0 " << std::endl;
std::cout << "-------" << std::endl;
}
// Training
int epoch = 0;
constexpr int size_data_in_epoch = size_data_train / size_batch;
while (true)
{
++epoch;
// random selection
// create a shuffled index array
af::array idx_data;
{
af::array vals_data;
af::array sort_data = af::randu(size_data_train, 1, dtype_t);
af::sort(vals_data, idx_data, sort_data, 0);
}
for (int step = 0; step < size_data_in_epoch; ++step)
{
// reset the errors (gradients)
for (auto& itm : grad) { itm = 0.0; }
for (auto& itm : layer) { itm->init(); }
// training samples for this step
// af::seq specifies a range of indices
// the range specified by its constructor is a little unusual, see:
// https://arrayfire.org/docs/classaf_1_1seq.htm
af::array idx_target = idx_data(af::seq((step + 0) * size_batch, (step + 1) * size_batch - 1));
// set the input values
data.front() = input(af::span, idx_target);
// forward pass
for (auto& itm : layer) { itm->forward(); }
// compute the error
grad.back() = data.back() - output(af::span, idx_target); // y - t
// backward pass
std::for_each(layer.rbegin(), layer.rend(), [](auto& itm) { itm->backward(); });
// update the weights
for (auto& itm : layer) { itm->SGD(); }
}
// log at regular intervals
if (epoch % 100 == 1)
{
auto diff = af::mean<var_t>(af::abs(grad.back()));
auto norm = af::norm(grad.back());
std::cout << "epoch : " << epoch << "\t" << "diff : " << diff << "\t" << "norm : " << norm << std::endl;
}
else
{
std::cout << "epoch : " << epoch << "\r";
}
}
return 0;
}