Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_resize.neon.c
1/*
2 * Copyright 2013-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <arm_neon.h>
29
30#include "NE10.h"
31#include "NE10_mask_table.h"
32
/* Number of fractional bits carried by one fixed-point interpolation coefficient. */
#define INTER_RESIZE_COEF_BITS (11)

/* Fixed-point weight used by the horizontal pass where a source sample is copied
 * without blending (1 << INTER_RESIZE_COEF_BITS). */
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)

/* Right shift applied by the vertical pass before clamping to 0..255:
 * twice the precision of a single coefficient. */
#define BITS (INTER_RESIZE_COEF_BITS * 2)

/* Rounding term added before shifting right by BITS (half of 1 << BITS). */
#define DELTA (1 << (BITS - 1))
37
/**
 * Load four consecutive bytes into lanes 0..3 of a 64-bit vector.
 *
 * Lanes 4..7 are zero. Exactly p[0]..p[3] are read, one byte at a time, so the
 * load has no alignment requirement and never touches memory past p[3].
 */
static inline uint8x8_t ne10_img_hresize_load_4_u8_neon (const unsigned char *p)
{
    uint8x8_t v = vdup_n_u8 (0);

    v = vld1_lane_u8 (p + 0, v, 0);
    v = vld1_lane_u8 (p + 1, v, 1);
    v = vld1_lane_u8 (p + 2, v, 2);
    v = vld1_lane_u8 (p + 3, v, 3);

    return v;
}

/**
 * Blend two adjacent 4-byte groups: for c in 0..3 the result lane c is
 * s[c] * alpha_vec.val[0][c] + s[c + 4] * alpha_vec.val[1][c].
 *
 * Reads eight bytes starting at s.
 */
static inline int32x4_t ne10_img_hresize_blend_4ch_neon (const unsigned char *s, int16x4x2_t alpha_vec)
{
    const int16x8_t qS = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (s)));
    const int32x4_t qT = vmull_s16 (vget_low_s16 (qS), alpha_vec.val[0]);

    return vmlal_s16 (qT, vget_high_s16 (qS), alpha_vec.val[1]);
}

/**
 * Scale one 4-byte group by a constant weight: result lane c is s[c] * dCoeff[c].
 *
 * Reads only s[0]..s[3]. An 8-byte load is deliberately avoided here because
 * only four bytes are used, and a wider load issued at the last group of a row
 * would read past the end of that row.
 */
static inline int32x4_t ne10_img_hresize_copy_4ch_neon (const unsigned char *s, int16x4_t dCoeff)
{
    const int16x8_t qS = vreinterpretq_s16_u16 (vmovl_u8 (ne10_img_hresize_load_4_u8_neon (s)));

    return vmull_s16 (vget_low_s16 (qS), dCoeff);
}

/**
 * Horizontal pass of a linear resize for rows of 4-byte groups (NEON).
 *
 * Every row src[k] (0 <= k < count) is converted into dst[k]. Output entries
 * are produced four at a time; for a step starting at dx, with sx = xofs[dx]:
 *
 *  - dx < xmax:  dst[k][dx + c] = src[k][sx + c]     * alpha[2 * (dx + c)]
 *                               + src[k][sx + 4 + c] * alpha[2 * (dx + c) + 1]
 *  - dx >= xmax: dst[k][dx + c] = src[k][sx + c] * INTER_RESIZE_COEF_SCALE
 *
 * for c = 0..3. Only xofs[dx] at the start of each step is consulted.
 *
 * NOTE(review): dx advances by 4 and each step stores four ints, so xmax and
 * dwidth are presumably multiples of 4 - confirm against the caller.
 *
 * @param src     Array of count source row pointers.
 * @param dst     Array of count destination row pointers.
 * @param count   Number of rows to process.
 * @param xofs    Source byte offset for each destination entry.
 * @param alpha   Coefficient pairs, two shorts per destination entry.
 * @param swidth  Not read by this implementation.
 * @param dwidth  Number of destination entries per row.
 * @param cn      Not read by this implementation (four bytes per group is hard-coded).
 * @param xmin    Not read by this implementation.
 * @param xmax    First destination entry that is scaled instead of blended.
 */
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    const int16x4_t dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);
    int dx, k;

    /* Kept for interface compatibility only. */
    (void) swidth;
    (void) cn;
    (void) xmin;

    /* Rows are handled in disjoint pairs so that each coefficient load is
     * shared by two rows. Stepping by two (not one) matters: every row is
     * independent of the others, so revisiting row k + 1 as the first row of
     * the next pair would only recompute identical values. */
    for (k = 0; k <= count - 2; k += 2)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = 0; dx < xmax; dx += 4)
        {
            const int sx = xofs[dx];
            /* De-interleaving load: val[0] = even-indexed, val[1] = odd-indexed coefficients. */
            const int16x4x2_t alpha_vec = vld2_s16 (&alpha[dx * 2]);
            const int32x4_t qT0_vec = ne10_img_hresize_blend_4ch_neon (&S0[sx], alpha_vec);
            const int32x4_t qT1_vec = ne10_img_hresize_blend_4ch_neon (&S1[sx], alpha_vec);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            const int sx = xofs[dx];
            const int32x4_t qT0_vec = ne10_img_hresize_copy_4ch_neon (&S0[sx], dCoeff);
            const int32x4_t qT1_vec = ne10_img_hresize_copy_4ch_neon (&S1[sx], dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    /* At most one row is left over when count is odd. */
    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];

        for (dx = 0; dx < xmax; dx += 4)
        {
            const int sx = xofs[dx];
            const int16x4x2_t alpha_vec = vld2_s16 (&alpha[dx * 2]);

            vst1q_s32 (&D[dx], ne10_img_hresize_blend_4ch_neon (&S[sx], alpha_vec));
        }

        for (; dx < dwidth; dx += 4)
        {
            const int sx = xofs[dx];

            vst1q_s32 (&D[dx], ne10_img_hresize_copy_4ch_neon (&S[sx], dCoeff));
        }
    }
}
143
144
/**
 * Blend eight 32-bit samples from two rows into eight 8-bit values.
 *
 * For i in 0..7 computes
 *   (S0[i] * beta0 + S1[i] * beta1 + DELTA) >> BITS
 * where beta0 / beta1 are lanes 0 / 1 of dBeta, then clamps the result to
 * 0..255 and narrows it to a byte. Reads S0[0..7] and S1[0..7].
 */
static inline uint8x8_t ne10_img_vresize_linear_8px_neon (const int *S0, const int *S1, int32x2_t dBeta)
{
    const int32x4_t qDelta = vdupq_n_s32 (DELTA);
    const int32x4_t qMin = vdupq_n_s32 (0);
    const int32x4_t qMax = vdupq_n_s32 (255);

    /* Weighted sum of the two rows, four lanes at a time. */
    int32x4_t qT_0123 = vmulq_lane_s32 (vld1q_s32 (&S0[0]), dBeta, 0);
    int32x4_t qT_4567 = vmulq_lane_s32 (vld1q_s32 (&S0[4]), dBeta, 0);
    qT_0123 = vmlaq_lane_s32 (qT_0123, vld1q_s32 (&S1[0]), dBeta, 1);
    qT_4567 = vmlaq_lane_s32 (qT_4567, vld1q_s32 (&S1[4]), dBeta, 1);

    /* Round, then drop the fractional bits. */
    qT_0123 = vshrq_n_s32 (vaddq_s32 (qT_0123, qDelta), BITS);
    qT_4567 = vshrq_n_s32 (vaddq_s32 (qT_4567, qDelta), BITS);

    /* Clamp to 0..255 so the plain (non-saturating) narrowing below is lossless. */
    qT_0123 = vminq_s32 (vmaxq_s32 (qT_0123, qMin), qMax);
    qT_4567 = vminq_s32 (vmaxq_s32 (qT_4567, qMin), qMax);

    return vmovn_u16 (vreinterpretq_u16_s16 (vcombine_s16 (vmovn_s32 (qT_0123),
                                                           vmovn_s32 (qT_4567))));
}

/**
 * Vertical pass of a linear resize (NEON): combines two rows of 32-bit
 * intermediate samples into one row of bytes.
 *
 * dst[x] = clamp ((src[0][x] * beta[0] + src[1][x] * beta[1] + DELTA) >> BITS, 0, 255)
 * for 0 <= x < width.
 *
 * @param src    Two row pointers; src[0] and src[1] are read.
 * @param dst    Destination row of bytes.
 * @param beta   Two weights; beta[0] applies to src[0], beta[1] to src[1].
 * @param width  Number of destination bytes to produce.
 */
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];
    int x = 0;

    /* Built with intrinsics instead of an empty-brace initializer, which is
     * not valid ISO C before C23. */
    int32x2_t dBeta = vdup_n_s32 ( (int) (beta[0]));
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    /* Full groups of eight outputs. */
    for (; x <= width - 8; x += 8)
    {
        vst1_u8 (&dst[x], ne10_img_vresize_linear_8px_neon (&S0[x], &S1[x], dBeta));
    }

    /* Remaining 1..7 outputs. */
    if (x < width)
    {
        /* NOTE(review): this path still loads eight ints from each source row
         * and reads and rewrites eight bytes at dst[x], although fewer than
         * eight outputs remain. All three buffers therefore need to be
         * addressable up to the next multiple of eight - confirm against the
         * caller's allocation. */
        const uint8x8_t dT_01234567 = ne10_img_vresize_linear_8px_neon (&S0[x], &S1[x], dBeta);

        /* The table entry, indexed by (remaining - 1), picks per bit between
         * the new result (mask bit set) and the existing destination byte
         * (mask bit clear). Presumably it selects the first `remaining`
         * bytes - verify against NE10_mask_table. */
        const uint8x8_t dMask = vld1_u8 ( (const uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        const uint8x8_t dDst_01234567 = vld1_u8 (&dst[x]);

        vst1_u8 (&dst[x], vbsl_u8 (dMask, dT_01234567, dDst_01234567));
    }
}