tiny_dnn 1.0.0
A header-only, dependency-free deep learning framework in C++11
batch_normalization_layer.h
/*
    Copyright (c) 2016, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
    names of its contributors may be used to endorse or promote products
    derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "tiny_dnn/util/util.h"
#include "tiny_dnn/util/math_functions.h"
#include "tiny_dnn/layers/layer.h"

#include <algorithm>
#include <cmath>

namespace tiny_dnn {

/**
 * Batch Normalization
 *
 * Normalize the activations of the previous layer at each batch.
 **/
class batch_normalization_layer : public layer {
public:
    typedef layer Base;

    /**
     * @param prev_layer [in] previous layer to be connected with this layer
     * @param epsilon    [in] small positive value to avoid zero-division
     * @param momentum   [in] momentum in the computation of the exponential average of the mean/stddev of the data
     * @param phase      [in] specify the current context (train/test)
     **/
    batch_normalization_layer(const layer& prev_layer,
                              float_t epsilon = 1e-5,
                              float_t momentum = 0.999,
                              net_phase phase = net_phase::train)
        : Base({ vector_type::data }, { vector_type::data }),
          in_channels_(prev_layer.out_shape()[0].depth_),
          in_spatial_size_(prev_layer.out_shape()[0].area()),
          phase_(phase),
          momentum_(momentum),
          eps_(epsilon),
          update_immidiately_(false)
    {
        init();
    }

    /**
     * @param in_spatial_size [in] spatial size (WxH) of the input data
     * @param in_channels     [in] channels of the input data
     * @param epsilon         [in] small positive value to avoid zero-division
     * @param momentum        [in] momentum in the computation of the exponential average of the mean/stddev of the data
     * @param phase           [in] specify the current context (train/test)
     **/
    batch_normalization_layer(serial_size_t in_spatial_size,
                              serial_size_t in_channels,
                              float_t epsilon = 1e-5,
                              float_t momentum = 0.999,
                              net_phase phase = net_phase::train)
        : Base({ vector_type::data }, { vector_type::data }),
          in_channels_(in_channels),
          in_spatial_size_(in_spatial_size),
          phase_(phase),
          momentum_(momentum),
          eps_(epsilon),
          update_immidiately_(false)
    {
        init();
    }

    virtual ~batch_normalization_layer() {}

    ///< number of incoming connections for each output unit
    serial_size_t fan_in_size() const override {
        return 1;
    }

    ///< number of outgoing connections for each input unit
    serial_size_t fan_out_size() const override {
        return 1;
    }

    ///< array of input shapes (width x height x depth)
    std::vector<index3d<serial_size_t>> in_shape() const override {
        return { index3d<serial_size_t>(in_spatial_size_, 1, in_channels_) };
    }

    ///< array of output shapes (width x height x depth)
    std::vector<index3d<serial_size_t>> out_shape() const override {
        return { index3d<serial_size_t>(in_spatial_size_, 1, in_channels_) };
    }

    ///< return delta of previous layer (delta = dE/da, a = wx in a fully-connected layer)
    void back_propagation(const std::vector<tensor_t*>& in_data,
                          const std::vector<tensor_t*>& out_data,
                          std::vector<tensor_t*>& out_grad,
                          std::vector<tensor_t*>& in_grad) override {
        tensor_t& prev_delta = *in_grad[0];
        tensor_t& curr_delta = *out_grad[0];
        const tensor_t& curr_out = *out_data[0];
        serial_size_t num_samples = static_cast<serial_size_t>(curr_out.size());

        CNN_UNREFERENCED_PARAMETER(in_data);

        tensor_t delta_dot_y = curr_out;
        vec_t mean_delta_dot_y, mean_delta;

        for (serial_size_t i = 0; i < num_samples; i++) {
            for (serial_size_t j = 0; j < curr_out[0].size(); j++) {
                delta_dot_y[i][j] *= curr_delta[i][j];
            }
        }
        moments(delta_dot_y, in_spatial_size_, in_channels_, &mean_delta_dot_y, nullptr);
        moments(curr_delta, in_spatial_size_, in_channels_, &mean_delta, nullptr);

        // if Y = (X - mean(X)) / sqrt(var(X) + eps), then
        //
        // dE/dX =
        //     (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
        //         ./ sqrt(var(X) + eps)
        //
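        // Derivation sketch (per channel): let mu = mean(X), v = var(X),
        // s = sqrt(v + eps) and Y = (X - mu) / s, with the statistics taken
        // over m = batch_size * in_spatial_size values. Differentiating
        // through mu and v gives, element-wise,
        //
        //   dE/dX_i = ( dE/dY_i
        //               - (1/m) * sum_j dE/dY_j                (via dmu/dX_i)
        //               - Y_i * (1/m) * sum_j dE/dY_j * Y_j )  (via dv/dX_i)
        //             / s
        //
        // mean_delta and mean_delta_dot_y computed above are exactly these
        // two per-channel averages, so the loop below applies this formula
        // element-wise.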
        for_i(num_samples, [&](int i) {
            for (serial_size_t j = 0; j < in_channels_; j++) {
                for (serial_size_t k = 0; k < in_spatial_size_; k++) {
                    serial_size_t index = j * in_spatial_size_ + k;

                    prev_delta[i][index]
                        = curr_delta[i][index] - mean_delta[j] - mean_delta_dot_y[j] * curr_out[i][index];

                    // stddev_ is calculated in the forward pass
                    prev_delta[i][index] /= stddev_[j];
                }
            }
        });
    }

    void forward_propagation(const std::vector<tensor_t*>& in_data,
                             std::vector<tensor_t*>& out_data) override {
        vec_t* mean = nullptr;
        vec_t* variance = nullptr;
        tensor_t& in = *in_data[0];
        tensor_t& out = *out_data[0];

        if (phase_ == net_phase::train) {
            // calculate mean/variance from this batch in train phase
            mean = &mean_current_;
            variance = &variance_current_;
            moments(*in_data[0], in_spatial_size_, in_channels_, mean, variance);
        }
        else {
            // use stored mean/variance in test phase
            mean = &mean_;
            variance = &variance_;
        }

        // y = (x - mean) ./ sqrt(variance + eps)
        calc_stddev(*variance);

        for_i(parallelize_, in_data[0]->size(), [&](int i) {
            const float_t* inptr = &in[i][0];
            float_t* outptr = &out[i][0];

            for (serial_size_t j = 0; j < in_channels_; j++) {
                float_t m = (*mean)[j];

                for (serial_size_t k = 0; k < in_spatial_size_; k++) {
                    *outptr++ = (*inptr++ - m) / stddev_[j];
                }
            }
        });

        if (phase_ == net_phase::train && update_immidiately_) {
            mean_ = mean_current_;
            variance_ = variance_current_;
        }
    }

    ///< notify changing context (train <=> test)
    void set_context(net_phase ctx) override
    {
        phase_ = ctx;
    }

    ///< name of layer, should be unique for each concrete class
    std::string layer_type() const override { return "batch-norm"; }

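    // Folds the statistics of the current mini-batch into the exponential
    // moving averages (mean_/variance_) that forward_propagation uses in
    // the test phase; the framework invokes this hook after each weight
    // update during training.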
    virtual void post_update() override {
        for (serial_size_t i = 0; i < mean_.size(); i++) {
            mean_[i] = momentum_ * mean_[i] + (1 - momentum_) * mean_current_[i];
            variance_[i] = momentum_ * variance_[i] + (1 - momentum_) * variance_current_[i];
        }
    }

    virtual void save(std::ostream& os) const override {
        Base::save(os);
        for (auto m : mean_) os << m << " ";
        for (auto v : variance_) os << v << " ";
    }

    virtual void load(std::istream& is) override {
        Base::load(is);
        for (auto& m : mean_) is >> m;
        for (auto& v : variance_) is >> v;
    }

    virtual void load(const std::vector<float_t>& src, int& idx) override {
        Base::load(src, idx);
        for (auto& m : mean_) m = src[idx++];
        for (auto& v : variance_) v = src[idx++];
    }

    void update_immidiately(bool update) {
        update_immidiately_ = update;
    }

    void set_stddev(const vec_t& stddev) {
        stddev_ = stddev;
    }

    void set_mean(const vec_t& mean) {
        mean_ = mean;
    }

    void set_variance(const vec_t& variance) {
        variance_ = variance;
        calc_stddev(variance);
    }

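    // This class has no default constructor, so cereal deserializes it via
    // the static load_and_construct hook below: the serialized fields are
    // read first, then the layer is constructed in place from them.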
    template <class Archive>
    static void load_and_construct(Archive& ar, cereal::construct<batch_normalization_layer>& construct) {
        shape3d in;
        serial_size_t in_spatial_size, in_channels;
        float_t eps, momentum;
        net_phase phase;
        vec_t mean, variance;

        ar(cereal::make_nvp("in_spatial_size", in_spatial_size),
           cereal::make_nvp("in_channels", in_channels),
           cereal::make_nvp("epsilon", eps),
           cereal::make_nvp("momentum", momentum),
           cereal::make_nvp("phase", phase),
           cereal::make_nvp("mean", mean),
           cereal::make_nvp("variance", variance));
        construct(in_spatial_size, in_channels, eps, momentum, phase);
        construct->set_mean(mean);
        construct->set_variance(variance);
    }

    template <class Archive>
    void serialize(Archive& ar) {
        layer::serialize_prolog(ar);
        ar(cereal::make_nvp("in_spatial_size", in_spatial_size_),
           cereal::make_nvp("in_channels", in_channels_),
           cereal::make_nvp("epsilon", eps_),
           cereal::make_nvp("momentum", momentum_),
           cereal::make_nvp("phase", phase_),
           cereal::make_nvp("mean", mean_),
           cereal::make_nvp("variance", variance_));
    }

    float_t epsilon() const {
        return eps_;
    }

    float_t momentum() const {
        return momentum_;
    }

private:
    void calc_stddev(const vec_t& variance) {
        for (size_t i = 0; i < in_channels_; i++) {
            stddev_[i] = std::sqrt(variance[i] + eps_);
        }
    }

    void init() {
        mean_current_.resize(in_channels_);
        mean_.resize(in_channels_);
        variance_current_.resize(in_channels_);
        variance_.resize(in_channels_);
        tmp_mean_.resize(in_channels_);
        stddev_.resize(in_channels_);
    }

    serial_size_t in_channels_;
    serial_size_t in_spatial_size_;

    net_phase phase_;
    float_t momentum_;
    float_t eps_;

    // mean/variance for this mini-batch
    vec_t mean_current_;
    vec_t variance_current_;

    vec_t tmp_mean_;

    // moving average of mean/variance
    vec_t mean_;
    vec_t variance_;
    vec_t stddev_;

    // for test
    bool update_immidiately_;
};

} // namespace tiny_dnn
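
A minimal usage sketch (illustrative, not part of the header above): it relies only on the second constructor and set_context() declared in this file; the surrounding network and training plumbing are omitted, and the include path assumes a default tiny_dnn checkout.

#include "tiny_dnn/layers/batch_normalization_layer.h"

using namespace tiny_dnn;

int main() {
    // Normalize a 6-channel feature map with 28x28 spatial positions per
    // channel, keeping the default epsilon (1e-5) and momentum (0.999).
    batch_normalization_layer bn(28 * 28, 6);

    // Training: each forward pass computes per-batch mean/variance, and
    // post_update() folds them into the moving averages.

    // Inference: switch context so the stored moving averages are used
    // instead of batch statistics.
    bn.set_context(net_phase::test);
    return 0;
}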