tiny_dnn 1.0.0
A header-only, dependency-free deep learning framework in C++11
tiny_quantized_fully_connected_kernel.h
/*
    Copyright (c) 2016, Taiga Nomi, Edgar Riba
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
    names of its contributors may be used to endorse or promote products
    derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once

#include <algorithm>  // std::min, std::max
#include <cstdint>    // uint8_t, int32_t
#include <vector>     // std::vector

#include "tiny_dnn/core/params/fully_params.h"
#include "tiny_dnn/core/kernels/tiny_quantization_kernel.h"
#include "tiny_dnn/core/kernels/tiny_quantized_matmul_kernel.h"

namespace tiny_dnn {
namespace core {
namespace kernels {

inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                  const vec_t& in,
                                                  const vec_t& W,
                                                  const vec_t& b,
                                                  vec_t& a,
                                                  const bool layer_parallelize) {
    // input quantization
    float_t min_input(in[0]);
    float_t max_input(in[0]);
    for (serial_size_t c = 0; c < params.in_size_; c++) {
        min_input = std::min(min_input, in[c]);
        max_input = std::max(max_input, in[c]);
    }
    std::vector<uint8_t> in_quantized =
        float_tensor_to_quantized<uint8_t>(in, min_input, max_input);

    // filter quantization
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);

    // output range
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_input, max_input, min_filter, max_filter, &min_output_value,
        &max_output_value);

    // bias quantization
    float_t min_bias(0);
    float_t max_bias(0);
    std::vector<uint8_t> bias_quantized;
    if (params.has_bias_) {
        for (serial_size_t inc = 0; inc < b.size(); inc++) {
            min_bias = std::min(min_bias, b[inc]);
            max_bias = std::max(max_bias, b[inc]);
        }
        if (min_bias == max_bias) {
            max_bias = b[0] + 1e-3f;
            min_bias = b[0] - 1e-3f;
        }
        bias_quantized =
            float_tensor_to_quantized<uint8_t>(b, min_bias, max_bias);
    }
    min_output_value += min_bias;
    max_output_value += max_bias;

    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));

    // calculate zero-point offsets
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_input, max_input);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output   = 1;
    const int32_t shift_output  = 0;

    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }

    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    // requantize from 32 bits down to 8 bits for the next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);

    // dequantize to float; this step could be removed when consecutive quantized layers are chained
    a = quantized_tensor_to_float<uint8_t>(a_requantized, min_output_requantized, max_output_requantized);
}

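// Note on the quantization arithmetic used above: assuming that
// float_tensor_to_quantized<uint8_t> applies the usual affine mapping
//   q = round((x - min) / (max - min) * 255),
// an input vector in = {-1.0, 0.5, 2.0} has range [-1, 2] and quantizes to
// roughly {0, 128, 255}; the corresponding zero point
// float_to_quantized_unclamped<uint8_t>(0.0f, -1.0f, 2.0f) is 85, so the term
// (in_quantized[c] - offset_input) is an integer proportional to in[c], which
// keeps the int32 accumulations in a_quantized centered on the true zero.
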
inline void tiny_quantized_fully_connected_back_kernel(const fully_params& params,
                                                       const vec_t& prev_out,
                                                       const vec_t& W,
                                                       vec_t& dW,
                                                       vec_t& prev_delta,
                                                       vec_t& curr_delta,
                                                       vec_t& db,
                                                       const bool layer_parallelize) {
    // previous output quantization
    float_t min_prev_out(prev_out[0]);
    float_t max_prev_out(prev_out[0]);
    for (serial_size_t inc = 0; inc < prev_out.size(); inc++) {
        min_prev_out = std::min(min_prev_out, prev_out[inc]);
        max_prev_out = std::max(max_prev_out, prev_out[inc]);
    }
    std::vector<uint8_t> prev_out_quantized =
        float_tensor_to_quantized<uint8_t>(prev_out, min_prev_out, max_prev_out);

    // filter quantization
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);

    // current delta quantization
    float_t min_curr_delta(curr_delta[0]);
    float_t max_curr_delta(curr_delta[0]);
    for (serial_size_t inc = 0; inc < curr_delta.size(); inc++) {
        min_curr_delta = std::min(min_curr_delta, curr_delta[inc]);
        max_curr_delta = std::max(max_curr_delta, curr_delta[inc]);
    }
    std::vector<uint8_t> curr_delta_quantized =
        float_tensor_to_quantized<uint8_t>(curr_delta, min_curr_delta, max_curr_delta);

    // output range for previous delta
    float_t min_prev_delta_value;
    float_t max_prev_delta_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_filter, max_filter, &min_prev_delta_value,
        &max_prev_delta_value);

    std::vector<int32_t> prev_delta_quantized(prev_delta.size(), static_cast<int32_t>(0));

    // output range for dW
    float_t min_dW_value;
    float_t max_dW_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_prev_out, max_prev_out, &min_dW_value,
        &max_dW_value);

    std::vector<int32_t> dW_quantized(dW.size(), static_cast<int32_t>(0));

    // calculate zero-point offsets
    const int32_t offset_prev_out =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_prev_out, max_prev_out);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t offset_curr_delta =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_curr_delta, max_curr_delta);
    // const int32_t zero_in_prev_delta =
    //     float_to_quantized<int32_t>(0.0f, min_prev_delta_value, max_prev_delta_value);

    for (serial_size_t c = 0; c < params.in_size_; c++) {
        // propagate delta to the previous layer:
        // prev_delta[c] += curr_delta[io] * W[c * out_size_ + io]
        for (serial_size_t io = 0; io < params.out_size_; io++) {
            prev_delta_quantized[c] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                                     * (static_cast<int32_t>(W_quantized[c * params.out_size_ + io]) - offset_filter);
        }
    }

    float_t min_prev_delta_requantized;
    float_t max_prev_delta_requantized;
    std::vector<uint8_t> prev_delta_requantized(prev_delta_quantized.size(), static_cast<uint8_t>(0));

    // requantize from 32 bits down to 8 bits for the next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(prev_delta_quantized, min_prev_delta_value, max_prev_delta_value,
        &min_prev_delta_requantized, &max_prev_delta_requantized, &prev_delta_requantized);

    // dequantize to float; this step could be removed when consecutive quantized layers are chained
    prev_delta = quantized_tensor_to_float<uint8_t>(prev_delta_requantized, min_prev_delta_requantized, max_prev_delta_requantized);

    for_(layer_parallelize, 0, size_t(params.out_size_), [&](const blocked_range& r) {
        // accumulate the weight gradient for this block of output indices:
        // dW[c * out_size_ + io] += curr_delta[io] * prev_out[c]
        for (serial_size_t c = 0; c < params.in_size_; c++) {
            for (int io = r.begin(); io < r.end(); io++) {
                dW_quantized[c * params.out_size_ + io] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                                                         * (static_cast<int32_t>(prev_out_quantized[c]) - offset_prev_out);
            }
        }

        if (params.has_bias_) {
            // vec_t& db = *in_grad[2];
            for (int i = r.begin(); i < r.end(); i++) {
                db[i] += curr_delta[i];
            }
        }
    });

    float_t min_dW_requantized;
    float_t max_dW_requantized;
    std::vector<uint8_t> dW_requantized(dW_quantized.size(), static_cast<uint8_t>(0));

    // requantize from 32 bits down to 8 bits for the next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(dW_quantized, min_dW_value, max_dW_value,
        &min_dW_requantized, &max_dW_requantized, &dW_requantized);

    // dequantize to float; this step could be removed when consecutive quantized layers are chained
    dW = quantized_tensor_to_float<uint8_t>(dW_requantized, min_dW_requantized, max_dW_requantized);
}

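// Note: unlike the overload above, the variant below expects in, W and b to
// already contain 8-bit quantized codes stored as floats, with their real-valued
// ranges passed separately as {min, max} pairs in in_r, W_r and b_r. It writes
// the requantized 8-bit result back into a (still stored as floats) and reports
// the output range through a_r[0] and a_r[1], so consecutive quantized layers
// can be chained without dequantizing in between.
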
inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                  const vec_t& in,
                                                  const vec_t& W,
                                                  const vec_t& b,
                                                  const vec_t& in_r,
                                                  const vec_t& W_r,
                                                  const vec_t& b_r,
                                                  vec_t& a,
                                                  vec_t& a_r,
                                                  const bool layer_parallelize) {
    // filter range
    float_t min_filter(W_r[0]);
    float_t max_filter(W_r[1]);
    if (min_filter == max_filter) {
        max_filter = W_r[1] + 1e-3f;
        min_filter = W_r[0] - 1e-3f;
    }
    // bias range
    float_t min_bias(b_r[0]);
    float_t max_bias(b_r[1]);
    if (params.has_bias_) {
        if (min_bias == max_bias) {
            max_bias = b_r[1] + 1e-3f;
            min_bias = b_r[0] - 1e-3f;
        }
    }
    // output range
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        in_r[0], in_r[1], min_filter, max_filter, &min_output_value,
        &max_output_value);

    // restore the quantized data types (values arrive as 8-bit codes stored in floats)
    std::vector<uint8_t> in_quantized, W_quantized, bias_quantized;
    for (size_t i = 0; i < in.size(); i++) {
        in_quantized.push_back(static_cast<uint8_t>(in[i]));
    }
    for (size_t i = 0; i < W.size(); i++) {
        W_quantized.push_back(static_cast<uint8_t>(W[i]));
    }
    for (size_t i = 0; i < b.size(); i++) {
        bias_quantized.push_back(static_cast<uint8_t>(b[i]));
    }
    min_output_value += min_bias;
    max_output_value += max_bias;

    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));

    // calculate zero-point offsets
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, in_r[0], in_r[1]);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output   = 1;
    const int32_t shift_output  = 0;

    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }

    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    // requantize from 32 bits down to 8 bits for the next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);

    // store the 8-bit codes directly in the float output, and report their range
    for (size_t i = 0; i < a_requantized.size(); i++) {
        a[i] = static_cast<float>(a_requantized[i]);
    }
    a_r[0] = min_output_requantized;
    a_r[1] = max_output_requantized;
}

}  // namespace kernels
}  // namespace core
}  // namespace tiny_dnn
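
A minimal usage sketch for the forward kernel above (not taken from the library's tests): it assumes fully_params is default-constructible with publicly assignable in_size_, out_size_ and has_bias_ members, as they are used by the kernels, and that vec_t is the library's float vector type; adapt if the actual definitions differ.

// Hedged usage sketch; member access on fully_params is an assumption.
#include "tiny_dnn/core/kernels/tiny_quantized_fully_connected_kernel.h"

int main() {
    using namespace tiny_dnn;
    using namespace tiny_dnn::core;

    fully_params params;            // assumption: members are publicly assignable
    params.in_size_  = 3;
    params.out_size_ = 2;
    params.has_bias_ = true;

    vec_t in = {float_t(0.5), float_t(-1.0), float_t(2.0)};  // 3 inputs
    vec_t W(3 * 2, float_t(0.1));   // indexed as W[c * out_size_ + i] by the kernel
    vec_t b  = {float_t(0.0), float_t(0.25)};                // one bias per output
    vec_t a(2, float_t(0));         // receives the dequantized activations

    kernels::tiny_quantized_fully_connected_kernel(
        params, in, W, b, a, /*layer_parallelize=*/false);
    return 0;
}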