Adding hidden layers to a neural network to improve accuracy on the Rastrigin function [C++ ArrayFire]

This is 六花.

Now that the neural network layers have been abstracted, it is easy to make the network deeper.
So this time I switch the function to be approximated from the Sphere function to the more complex Rastrigin function and look at how the results change with the depth of the network.
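
For reference, the Rastrigin function computed in the code below is

    f(\mathbf{x}) = 10n + \sum_{i=1}^{n}\left(x_i^2 - 10\cos(2\pi x_i)\right), \qquad x_i \in [-5.12,\ 5.12]

with n = size_input = 2 here and the inputs drawn uniformly from [-5.12, 5.12]. The cosine term puts a grid of local ripples on top of the Sphere function's bowl, which is what makes it the harder target.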

§Environment
Windows 10
Microsoft Visual Studio Community 2022 (64 ビット) Version 17.3.4
ArrayFire v3.8.2


    constexpr int size_hidden_layer = 1;

Since I want to build the model with for loops, I declare a variable for that purpose.


    // Build the model

    std::vector<af::array> data; // layer values (activations)
    data.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
    {
        data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
        data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
    }
    data.push_back(af::constant(0.0, size_output, size_batch, dtype_t));

    std::vector<af::array> grad; // errors (gradients)
    grad.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
    {
        grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
        grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
    }
    grad.push_back(af::constant(0.0, size_output, size_batch, dtype_t));

    std::vector<std::shared_ptr<Layer>> layer; // processing layers
    layer.push_back(std::make_shared<FC_layer>(data.at(0), data.at(1), grad.at(0), grad.at(1)));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer * 2; i_hidden_layer += 2)
    {
        layer.push_back(std::make_shared<tanhExp_layer>(data.at(i_hidden_layer + 1), data.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 1), grad.at(i_hidden_layer + 2)));
        layer.push_back(std::make_shared<FC_layer>(data.at(i_hidden_layer + 2), data.at(i_hidden_layer + 3), grad.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 3)));
    }
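
To make the indexing concrete, here is how the buffers and layers line up when size_hidden_layer = 2 (a sketch derived from the loops above, not part of the listing):

    // size_hidden_layer = 2  ->  data.size() == grad.size() == 6, layer.size() == 5
    //
    //   data[0] (input,  size_input  x size_batch)
    //     -> layer[0] FC_layer      -> data[1] (size_hidden x size_batch)
    //     -> layer[1] tanhExp_layer -> data[2] (size_hidden x size_batch)
    //     -> layer[2] FC_layer      -> data[3] (size_hidden x size_batch)
    //     -> layer[3] tanhExp_layer -> data[4] (size_hidden x size_batch)
    //     -> layer[4] FC_layer      -> data[5] (output, size_output x size_batch)
    //
    // grad[i] has the same shape as data[i] and receives the gradient with respect to
    // data[i] during the backward pass.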

The whole model is thus built with for loops.
This is very easy to get wrong, so I think it is important to test the model with gradient checking once it is built.
(I did in fact make a mistake while writing this source code.)
The way the layer for loop is written, in particular, is where personal style shows.
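
The check in the full listing below compares the analytic gradient of one input element against a centered numerical estimate of the loss J = (y - t)^2 / 2 at output element (0, 0), and reports their relative difference:

    \text{gradapprox} = \frac{J(x_{0,0} + \varepsilon) - J(x_{0,0} - \varepsilon)}{2\varepsilon}, \qquad
    \text{difference} = \frac{\lvert \text{grad} - \text{gradapprox} \rvert}{\lvert \text{grad} \rvert + \lvert \text{gradapprox} \rvert}

where grad is the value the backward pass puts into grad.front()(0, 0). If the backward pass is wired correctly, difference should come out close to zero.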

The results for one to six hidden layers are shown below.
Because the gradients started to diverge partway through, the learning rate was halved (0.001 → 0.0005) for the deeper models.
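
For reference, the learning rate lives as a hard-coded constant inside FC_layer::SGD in the listing at the end, so switching between the two values just means editing that one constant and rebuilding (excerpt from the listing, with the two values noted in a comment):

    virtual void SGD()
    {
        // alpha = 0.001 for the runs with 1-3 hidden layers, 0.0005 for the runs with 4-6
        constexpr var_t alpha = (var_t)0.0005;
        W -= alpha * dW;
        B -= alpha * dB;

        W.eval();
        B.eval();
    }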

// 1 hidden layer, alpha = 0.001
epoch : 1       diff : 7.37932  norm : 108.335
epoch : 101     diff : 8.29352  norm : 114.789
epoch : 201     diff : 7.1323   norm : 100.212
epoch : 301     diff : 8.35752  norm : 111.828
epoch : 401     diff : 9.42686  norm : 134.095
epoch : 501     diff : 7.58281  norm : 106.214
epoch : 601     diff : 9.0468   norm : 122.5
epoch : 701     diff : 7.90238  norm : 108.289
epoch : 801     diff : 8.02885  norm : 112.663
epoch : 901     diff : 8.30578  norm : 116.401
epoch : 1001    diff : 6.91363  norm : 97.8099
epoch : 1101    diff : 8.07373  norm : 109.409
epoch : 1201    diff : 7.4678   norm : 106.38
epoch : 1301    diff : 6.98367  norm : 94.5784
epoch : 1401    diff : 6.31404  norm : 89.8606
epoch : 1501    diff : 7.02407  norm : 97.0159
epoch : 1601    diff : 7.5026   norm : 103.159
epoch : 1701    diff : 6.66432  norm : 93.4793
epoch : 1801    diff : 6.16135  norm : 89.009

// 2 hidden layers, alpha = 0.001
epoch : 1       diff : 8.90514  norm : 122.009
epoch : 101     diff : 6.84836  norm : 93.566
epoch : 201     diff : 4.69939  norm : 68.5233
epoch : 301     diff : 5.15874  norm : 72.5089
epoch : 401     diff : 1.88894  norm : 30.4353
epoch : 501     diff : 1.41153  norm : 21.0782
epoch : 601     diff : 1.46311  norm : 24.3487
epoch : 701     diff : 1.32166  norm : 19.0509
epoch : 801     diff : 1.32115  norm : 20.1267
epoch : 901     diff : 0.888632 norm : 14.6458
epoch : 1001    diff : 1.1128   norm : 17.5781
epoch : 1101    diff : 0.903911 norm : 13.8627
epoch : 1201    diff : 0.981142 norm : 14.626
epoch : 1301    diff : 0.800413 norm : 12.5818
epoch : 1401    diff : 0.672896 norm : 10.6123
epoch : 1501    diff : 0.679384 norm : 10.8956
epoch : 1601    diff : 0.659158 norm : 10.6668
epoch : 1701    diff : 1.51917  norm : 21.0317
epoch : 1801    diff : 0.547854 norm : 8.07929

// 3 hidden layers, alpha = 0.001
epoch : 1       diff : 7.72143  norm : 108.489
epoch : 101     diff : 1.1357   norm : 17.8866
epoch : 201     diff : 0.467786 norm : 7.54614
epoch : 301     diff : 0.45597  norm : 6.57141
epoch : 401     diff : 0.281907 norm : 4.32094
epoch : 501     diff : 0.487077 norm : 7.00092
epoch : 601     diff : 0.278516 norm : 4.10368
epoch : 701     diff : 0.209454 norm : 3.20719
epoch : 801     diff : 0.233474 norm : 3.60295
epoch : 901     diff : 0.236193 norm : 3.38057
epoch : 1001    diff : 0.189152 norm : 3.14622
epoch : 1101    diff : 0.188179 norm : 2.7994
epoch : 1201    diff : 0.418586 norm : 5.83399
epoch : 1301    diff : 0.194182 norm : 2.76702
epoch : 1401    diff : 0.160596 norm : 2.41861
epoch : 1501    diff : 0.171662 norm : 2.56559
epoch : 1601    diff : 0.160864 norm : 2.38836
epoch : 1701    diff : 0.18597  norm : 2.84894
epoch : 1801    diff : 0.183716 norm : 3.02124

// 4 hidden layers, alpha = 0.0005
epoch : 1       diff : 9.81055  norm : 135.139
epoch : 101     diff : 0.606811 norm : 9.36324
epoch : 201     diff : 0.289493 norm : 4.38984
epoch : 301     diff : 0.482455 norm : 7.27102
epoch : 401     diff : 0.148224 norm : 2.16977
epoch : 501     diff : 0.150126 norm : 2.13533
epoch : 601     diff : 0.11908  norm : 1.76977
epoch : 701     diff : 0.153793 norm : 2.14698
epoch : 801     diff : 0.108161 norm : 1.60318
epoch : 901     diff : 0.173767 norm : 2.83776
epoch : 1001    diff : 0.145149 norm : 2.16293
epoch : 1101    diff : 0.267259 norm : 3.73422
epoch : 1201    diff : 0.125327 norm : 1.8236
epoch : 1301    diff : 0.188406 norm : 2.78781
epoch : 1401    diff : 0.126291 norm : 1.86106
epoch : 1501    diff : 0.380341 norm : 5.96416
epoch : 1601    diff : 0.089635 norm : 1.30651
epoch : 1701    diff : 0.0850607        norm : 1.29814
epoch : 1801    diff : 0.207073 norm : 3.38722

// 5 hidden layers, alpha = 0.0005
epoch : 1       diff : 8.82929  norm : 119.126
epoch : 101     diff : 0.289587 norm : 5.07952
epoch : 201     diff : 0.317917 norm : 5.18149
epoch : 301     diff : 0.144423 norm : 2.07593
epoch : 401     diff : 0.148498 norm : 2.31556
epoch : 501     diff : 0.135578 norm : 2.32463
epoch : 601     diff : 0.169594 norm : 2.405
epoch : 701     diff : 0.101373 norm : 1.51
epoch : 801     diff : 0.0961684        norm : 1.41652
epoch : 901     diff : 0.121638 norm : 1.76756
epoch : 1001    diff : 0.112943 norm : 1.71853
epoch : 1101    diff : 0.066157 norm : 0.967472
epoch : 1201    diff : 0.0773755        norm : 1.14591
epoch : 1301    diff : 0.0572946        norm : 0.83687
epoch : 1401    diff : 0.0538033        norm : 0.761378
epoch : 1501    diff : 0.0622578        norm : 0.891664
epoch : 1601    diff : 0.0577832        norm : 0.854547
epoch : 1701    diff : 0.0629358        norm : 1.00799
epoch : 1801    diff : 0.0524491        norm : 0.786615

// 6 hidden layers, alpha = 0.0005
epoch : 1       diff : 8.48437  norm : 116.941
epoch : 101     diff : 0.265852 norm : 4.21897
epoch : 201     diff : 0.312853 norm : 5.28629
epoch : 301     diff : 0.17947  norm : 3.03704
epoch : 401     diff : 0.118821 norm : 1.96942
epoch : 501     diff : 0.113753 norm : 1.75402
epoch : 601     diff : 0.164634 norm : 3.18647
epoch : 701     diff : 0.066769 norm : 1.07846
epoch : 801     diff : 0.231552 norm : 3.99542
epoch : 901     diff : 0.0594998        norm : 1.0429
epoch : 1001    diff : 0.048821 norm : 0.76639
epoch : 1101    diff : 0.0531493        norm : 0.818055
epoch : 1201    diff : 0.0566023        norm : 1.16451
epoch : 1301    diff : 0.0442795        norm : 0.681057
epoch : 1401    diff : 0.0399107        norm : 0.572991
epoch : 1501    diff : 0.0404391        norm : 0.617015
epoch : 1601    diff : 0.0449341        norm : 0.718099
epoch : 1701    diff : 0.0375647        norm : 0.585123
epoch : 1801    diff : 0.0410178        norm : 0.678535

Although the gains shrink with each additional layer, you can see that diff keeps getting smaller.
For the deeper models the results might improve further with more epochs, but I cut the runs off here this time.

Finally, here is the full source code.


#include <arrayfire.h>
#undef max
#undef min

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <array>
#include <vector>
#include <memory>
#include <random>

//using var_t = double;
//constexpr auto dtype_t = af::dtype::f64;
//constexpr var_t eps = 1e-7; // around 1e-4 for float, around 1e-7 for double

using var_t = float;
constexpr auto dtype_t = af::dtype::f32;
constexpr var_t eps = 1e-4; // around 1e-4 for float, around 1e-7 for double

struct Layer
{
    virtual ~Layer() = default;     // virtual destructor: layers are held through Layer pointers
    virtual void init() {}          // used to reset the gradients
    virtual void forward() = 0;     // forward pass
    virtual void backward() = 0;    // backward pass
    virtual void SGD() {}           // weight update
};

struct FC_layer : public Layer
{
    const af::array& x;
    af::array& y;
    af::array& dx;
    const af::array& dy;

    const int size_batch;

    af::array W;
    af::array B;

    af::array dW;
    af::array dB;

    FC_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
        : x(x)
        , y(y)
        , dx(dx)
        , dy(dy)
        , size_batch(x.dims(1))
    {
        // dims(n) returns the number of elements along dimension n
        // He initialization
        W = af::randn(y.dims(0), x.dims(0), 1, dtype_t) * std::sqrt(var_t(2) / var_t(x.dims(0)));
        B = af::randn(y.dims(0), 1, 1, dtype_t) * std::sqrt(var_t(2) / var_t(x.dims(0)));

        dW = W * 0.0;
        dB = B * 0.0;
    }

    virtual void init()
    {
        dW = 0.0;
        dB = 0.0;
    }

    virtual void forward()
    {
        // af::tile copies the array an integer number of times along each dimension
        y = af::matmul(W, x) + af::tile(B, 1, size_batch);
        y.eval();
    }

    virtual void backward()
    {
        dy.eval();

        // af::matmulNT takes the product with the second argument transposed
        // the gradients of the whole mini-batch are accumulated here, so the larger the batch,
        // the larger the values in dW and dB; sooner or later they must be divided by the batch size
        dW += af::matmulNT(dy, x) / (var_t)size_batch;
        dB += af::sum(dy, 1) / (var_t)size_batch;

        dx += af::matmulTN(W, dy);

        dW.eval();
        dB.eval();
    }

    virtual void SGD()
    {
        //constexpr var_t alpha = (var_t)0.01;
        constexpr var_t alpha = (var_t)0.0005;
        W -= alpha * dW;
        B -= alpha * dB;

        W.eval();
        B.eval();
    }
};

struct ReLU_layer : public Layer
{
    const af::array& x;
    af::array& y;
    af::array& dx;
    const af::array& dy;

    ReLU_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
        : x(x)
        , y(y)
        , dx(dx)
        , dy(dy)
    {
    }

    virtual void forward()
    {
        y = af::select((x >= 0.0), x, 0.0); // element-wise: x if x >= 0, otherwise 0.0
        y.eval();
    }

    virtual void backward()
    {
        dy.eval();
        dx += af::select((x >= 0.0), dy, 0.0); // element-wise: dy if x >= 0, otherwise 0.0
    }
};

struct tanhExp_layer : public Layer
{
    const af::array& x;
    af::array& y;
    af::array& dx;
    const af::array& dy;

    tanhExp_layer(const af::array& x, af::array& y, af::array& dx, const af::array& dy)
        : x(x)
        , y(y)
        , dx(dx)
        , dy(dy)
    {
    }

    virtual void forward()
    {
        y = x * af::tanh(af::exp(x));
        y.eval();
    }

    virtual void backward()
    {
        dy.eval();
        const af::array x_exp = af::exp(x);
        const af::array tanh_exp = af::tanh(x_exp);
        dx += dy * (tanh_exp - x * x_exp * (tanh_exp * tanh_exp - (var_t)1));
    }
};

int main()
{
    af::info();

    constexpr int size_input = 2;
    constexpr int size_hidden = 300;
    constexpr int size_output = 1;
    constexpr int size_batch = 128;

    constexpr int size_hidden_layer = 6;

    constexpr int size_data = 100000;
    constexpr int size_data_train = (int)(size_data * 0.7);

    // Rastrigin function
    af::array input = af::randu(size_input, size_data, dtype_t) * (var_t)10.24 - (var_t)5.12;
    af::array output = (var_t)(10 * size_input) + af::sum(af::pow2(input) - (var_t)10 * af::cos((var_t)2 * af::Pi * input), 0);


    // Build the model

    std::vector<af::array> data; // layer values (activations)
    data.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
    {
        data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
        data.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
    }
    data.push_back(af::constant(0.0, size_output, size_batch, dtype_t));

    std::vector<af::array> grad; // errors (gradients)
    grad.push_back(af::constant(0.0, size_input, size_batch, dtype_t));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer; ++i_hidden_layer)
    {
        grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
        grad.push_back(af::constant(0.0, size_hidden, size_batch, dtype_t));
    }
    grad.push_back(af::constant(0.0, size_output, size_batch, dtype_t));

    std::vector<std::shared_ptr<Layer>> layer; // processing layers
    layer.push_back(std::make_shared<FC_layer>(data.at(0), data.at(1), grad.at(0), grad.at(1)));
    for (int i_hidden_layer = 0; i_hidden_layer < size_hidden_layer * 2; i_hidden_layer += 2)
    {
        layer.push_back(std::make_shared<tanhExp_layer>(data.at(i_hidden_layer + 1), data.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 1), grad.at(i_hidden_layer + 2)));
        layer.push_back(std::make_shared<FC_layer>(data.at(i_hidden_layer + 2), data.at(i_hidden_layer + 3), grad.at(i_hidden_layer + 2), grad.at(i_hidden_layer + 3)));
    }

    // gradient checking
    {
        // everything up to setting the input data
        auto init_phase = [&]()
        {
            // reset the errors (gradients)
            for (auto& itm : grad) { itm = 0.0; }
            for (auto& itm : layer) { itm->init(); }

            // set the input data
            data.front() = input(af::span, af::seq(0, size_batch - 1));
        };

        // forward and backward pass
        auto forback = [&]()
        {
            // forward pass
            for (auto& itm : layer) { itm->forward(); }

            // compute the error
            grad.back() = data.back() - output(af::span, af::seq(0, size_batch - 1)); // y - t

            // backward pass
            std::for_each(layer.rbegin(), layer.rend(), [](auto& itm) { itm->backward(); });
        };

        var_t grad_test = (var_t)123;
        {
            // everything up to setting the input data
            init_phase();

            // forward and backward pass
            forback();

            grad_test = af::sum<var_t>(grad.front()(0, 0));
        }

        var_t grad_plus = (var_t)123;
        {
            // everything up to setting the input data
            init_phase();

            // perturb the input
            data.front()(0, 0) += eps;

            // forward and backward pass
            forback();

            grad_plus = af::sum<var_t>(af::pow(grad.back(), 2.0)(0, 0) / (var_t)2); // mse
        }

        var_t grad_minus = (var_t)123;
        {
            // everything up to setting the input data
            init_phase();

            // perturb the input
            data.front()(0, 0) -= eps;

            // forward and backward pass
            forback();

            grad_minus = af::sum<var_t>(af::pow(grad.back(), 2.0)(0, 0) / (var_t)2); // mse
        }

        auto gradapprox = (grad_plus - grad_minus) / (2.0 * eps); // numerical differentiation
        auto numerator = sqrt(pow(grad_test - gradapprox, 2));
        auto norm_grad = sqrt(grad_test * grad_test);
        auto norm_gradapprox = sqrt(gradapprox * gradapprox);
        auto denominator = norm_grad + norm_gradapprox;
        auto difference = numerator / denominator; // the difference to verify

        std::cout
            << "--------" << std::endl
            << std::setw(12) << gradapprox << "" << std::endl
            << std::setw(12) << numerator << " = 0" << std::endl
            << std::setw(12) << norm_grad << std::endl
            << std::setw(12) << norm_gradapprox << "" << std::endl
            << std::setw(12) << denominator << std::endl
            << std::setw(12) << difference << " = 0 " << std::endl;

        std::cout << "-------" << std::endl;
    }

    // Training
    int epoch = 0;
    constexpr int size_data_in_epoch = size_data_train / size_batch;
    while (true)
    {
        ++epoch;

        // Random selection:
        // create a shuffled array of indices
        af::array idx_data;
        {
            af::array vals_data;
            af::array sort_data = af::randu(size_data_train, 1, dtype_t);
            af::sort(vals_data, idx_data, sort_data, 0);
        }

        for (int step = 0; step < size_data_in_epoch; ++step)
        {
            // reset the errors (gradients)
            for (auto& itm : grad) { itm = 0.0; }
            for (auto& itm : layer) { itm->init(); }

            // training targets for this step
            // af::seq specifies a range of indices
            // the range specified by each constructor is a little unusual, see:
            // https://arrayfire.org/docs/classaf_1_1seq.htm
            af::array idx_target = idx_data(af::seq((step + 0) * size_batch, (step + 1) * size_batch - 1));

            // set the input values
            data.front() = input(af::span, idx_target);

            // forward pass
            for (auto& itm : layer) { itm->forward(); }

            // compute the error
            grad.back() = data.back() - output(af::span, idx_target); // y - t

            // backward pass
            std::for_each(layer.rbegin(), layer.rend(), [](auto& itm) { itm->backward(); });

            // apply the gradient update (SGD)
            for (auto& itm : layer) { itm->SGD(); }
        }

        // log at regular intervals
        if (epoch % 100 == 1)
        {
            auto diff = af::mean<var_t>(af::abs(grad.back()));
            auto norm = af::norm(grad.back());
            std::cout << "epoch : " << epoch << "\t" << "diff : " << diff << "\t" << "norm : " << norm << std::endl;
        }
        else
        {
            std::cout << "epoch : " << epoch << "\r";
        }
    }

    return 0;
}
