// SSE2 population count over a range of 128-bit words [block, block_end).
// Each 32-bit lane is reduced with the classic parallel bit-count, then the
// four per-lane totals are summed after the loop.
const unsigned mu1 = 0x55555555;  // binary 0101...
const unsigned mu2 = 0x33333333;  // binary 0011...
const unsigned mu3 = 0x0F0F0F0F;  // binary 00001111...
const unsigned mu4 = 0x0000003F;  // a 32-bit lane count fits in 6 bits

// Load the bit masks into all four 32-bit lanes
__m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
__m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
__m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
__m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);

__m128i mcnt = _mm_xor_si128(m1, m1);  // per-lane counters = 0
__m128i tmp1, tmp2;

do
{
    __m128i b = _mm_load_si128(block++);  // next 128-bit word

    // 2-bit sums: b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
    tmp1 = _mm_srli_epi32(b, 1);
    tmp1 = _mm_and_si128(tmp1, m1);
    tmp2 = _mm_and_si128(b, m1);
    b    = _mm_add_epi32(tmp1, tmp2);

    // 4-bit sums: b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
    tmp1 = _mm_srli_epi32(b, 2);
    tmp1 = _mm_and_si128(tmp1, m2);
    tmp2 = _mm_and_si128(b, m2);
    b    = _mm_add_epi32(tmp1, tmp2);

    // 8-bit sums: b = (b + (b >> 4)) & 0x0F0F0F0F
    tmp1 = _mm_srli_epi32(b, 4);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m3);

    // 16-bit sums: b = b + (b >> 8)
    tmp1 = _mm_srli_epi32(b, 8);
    b    = _mm_add_epi32(b, tmp1);

    // final lane counts: b = (b + (b >> 16)) & 0x0000003F
    tmp1 = _mm_srli_epi32(b, 16);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m4);

    mcnt = _mm_add_epi32(mcnt, b);  // accumulate per-lane counts
} while (block < block_end);

// Horizontal sum of the four lane counters
alignas(16) unsigned tcnt[4];
_mm_store_si128((__m128i*)tcnt, mcnt);
return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
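
// Reference sketch (not part of the SIMD routine above; the helper name is
// illustrative): the same five reduction steps written as a plain scalar
// 32-bit popcount, which is what each lane of the loop computes.
static inline unsigned bitcount32(unsigned b)
{
    b = (b & 0x55555555) + ((b >> 1) & 0x55555555);  // 2-bit sums
    b = (b & 0x33333333) + ((b >> 2) & 0x33333333);  // 4-bit sums
    b = (b + (b >> 4)) & 0x0F0F0F0F;                 // 8-bit sums
    b = b + (b >> 8);                                 // 16-bit sums
    b = (b + (b >> 16)) & 0x0000003F;                 // final count, 0..32
    return b;
}
// The SSE2 loop runs these steps on four lanes at once and defers the
// horizontal add of the four lane counters until after the loop.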
// Same parallel popcount, but each 128-bit word is first combined with the
// matching word of a second block (mask_block) through sse2_func (for example
// a SIMD AND/OR/XOR), so the result is the bit count of op(block, mask).
const unsigned mu1 = 0x55555555;
const unsigned mu2 = 0x33333333;
const unsigned mu3 = 0x0F0F0F0F;
const unsigned mu4 = 0x0000003F;

// Load the bit masks into all four 32-bit lanes
__m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
__m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
__m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
__m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);

__m128i mcnt = _mm_xor_si128(m1, m1);  // per-lane counters = 0
__m128i tmp1, tmp2;

do
{
    __m128i b = _mm_load_si128(block++);      // next word of the block
    tmp1 = _mm_load_si128(mask_block++);      // next word of the mask block

    b = sse2_func(b, tmp1);                   // combine: b = op(b, mask)

    // 2-bit sums: b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
    tmp1 = _mm_srli_epi32(b, 1);
    tmp1 = _mm_and_si128(tmp1, m1);
    tmp2 = _mm_and_si128(b, m1);
    b    = _mm_add_epi32(tmp1, tmp2);

    // 4-bit sums: b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
    tmp1 = _mm_srli_epi32(b, 2);
    tmp1 = _mm_and_si128(tmp1, m2);
    tmp2 = _mm_and_si128(b, m2);
    b    = _mm_add_epi32(tmp1, tmp2);

    // 8-bit sums: b = (b + (b >> 4)) & 0x0F0F0F0F
    tmp1 = _mm_srli_epi32(b, 4);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m3);

    // 16-bit sums: b = b + (b >> 8)
    tmp1 = _mm_srli_epi32(b, 8);
    b    = _mm_add_epi32(b, tmp1);

    // final lane counts: b = (b + (b >> 16)) & 0x0000003F
    tmp1 = _mm_srli_epi32(b, 16);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m4);

    mcnt = _mm_add_epi32(mcnt, b);  // accumulate per-lane counts
} while (block < block_end);

// Horizontal sum of the four lane counters
alignas(16) unsigned tcnt[4];
_mm_store_si128((__m128i*)tcnt, mcnt);
return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
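
// Usage sketch (assumptions: the surrounding routine takes the mask block and
// a binary SIMD functor as parameters; the functor and the call below are
// illustrative, not the declared interface, which is not shown in this
// listing). Combining with AND counts the bits of an intersection without
// materializing the intermediate block.
struct and_op
{
    __m128i operator()(__m128i a, __m128i b) const { return _mm_and_si128(a, b); }
};
// e.g.: unsigned cnt = bit_count_op(blk_a, blk_a_end, blk_b, and_op());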
// Combined pass over [block, block_end): accumulates the total bit count of
// the block (written to *bit_count) and, in the same loop, the number of
// same-value bit runs in the block (the size a GAP, i.e. run-length,
// representation would need), returned as 'count'.
const unsigned mu1 = 0x55555555;
const unsigned mu2 = 0x33333333;
const unsigned mu3 = 0x0F0F0F0F;
const unsigned mu4 = 0x0000003F;

// Load the bit masks into all four 32-bit lanes
__m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
__m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
__m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
__m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);

__m128i mcnt = _mm_xor_si128(m1, m1);  // per-lane bit counters = 0
__m128i tmp1, tmp2;

// Start from one assumed run per 32-bit word; the scalar corrections below
// add in-word changes and cancel the runs that do not actually start.
int count = (int)(block_end - block) * 4;

unsigned w, w0, w_prev;
const int w_shift = sizeof(w) * 8 - 1;  // 31: isolates the MSB of a word
bool first_word = true;

// First word: remember its MSB in w_prev so the boundary check against the
// next word's LSB can be made (the word's own load and in-word change count
// are elided in this listing).
count -= (w_prev = (w0 >> w_shift));

alignas(16) unsigned tcnt[4];

do
{
    __m128i b = _mm_load_si128(block);

    // tcnt[i] = w ^ (w >> 1): set bits mark the positions where the bit value
    // flips inside word i; its popcount is that word's in-word change count.
    tmp1 = _mm_srli_epi32(b, 1);
    tmp2 = _mm_xor_si128(b, tmp1);
    _mm_store_si128((__m128i*)tcnt, tmp2);

    // Parallel popcount of the original words (same scheme as above);
    // tmp1 still holds b >> 1 from the change-mask computation.
    tmp1 = _mm_and_si128(tmp1, m1);      // (b >> 1) & 0x55555555
    tmp2 = _mm_and_si128(b, m1);         //  b       & 0x55555555
    b    = _mm_add_epi32(tmp1, tmp2);    // 2-bit sums

    tmp1 = _mm_srli_epi32(b, 2);
    tmp1 = _mm_and_si128(tmp1, m2);
    tmp2 = _mm_and_si128(b, m2);
    b    = _mm_add_epi32(tmp1, tmp2);    // 4-bit sums

    tmp1 = _mm_srli_epi32(b, 4);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m3);         // 8-bit sums

    tmp1 = _mm_srli_epi32(b, 8);
    b    = _mm_add_epi32(b, tmp1);       // 16-bit sums

    tmp1 = _mm_srli_epi32(b, 16);
    b    = _mm_add_epi32(b, tmp1);
    b    = _mm_and_si128(b, m4);         // final 32-bit lane counts

    mcnt = _mm_add_epi32(mcnt, b);       // accumulate bit counts

    // Scalar run-count correction for the four 32-bit words of this block.
    // w0 is re-read word by word; the per-word loads, the popcount of the
    // matching tcnt[i], and the zero/non-zero branching are elided here.
    count -= !(w_prev ^ (w0 & 1));       // previous MSB == this LSB: no new run
    count -= w_prev = (w0 >> w_shift);   // drop the MSB term, carry it forward
    count -= !w_prev; w_prev ^= w_prev;  // all-zero word: cancel its assumed run

    count -= !(w_prev ^ (w0 & 1));
    count -= w_prev = (w0 >> w_shift);
    count -= !w_prev; w_prev ^= w_prev;

    count -= !(w_prev ^ (w0 & 1));
    count -= w_prev = (w0 >> w_shift);
    count -= !w_prev; w_prev ^= w_prev;

    count -= !(w_prev ^ (w0 & 1));
    count -= w_prev = (w0 >> w_shift);
    count -= !w_prev; w_prev ^= w_prev;
} while (++block < block_end);

_mm_store_si128((__m128i*)tcnt, mcnt);
*bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

return unsigned(count);
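
// Reference sketch (illustrative only; names are assumptions): a scalar
// equivalent of the run counter above. Walk the block LSB-first and count
// how many runs of identical bits it contains, across word boundaries.
static inline unsigned block_run_count(const unsigned* blk, unsigned nwords)
{
    unsigned count = 1;              // a block always holds at least one run
    unsigned prev = blk[0] & 1;      // value of the previously seen bit
    for (unsigned i = 0; i < nwords; ++i)
        for (unsigned j = 0; j < 32; ++j)
        {
            unsigned bit = (blk[i] >> j) & 1;
            count += (bit != prev);  // every flip starts a new run
            prev = bit;
        }
    return count;
}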
// SSE2 search over a short sorted buffer of 16-bit GAP words: find the index
// of the first element in pbuf[0..size) that is >= pos (size <= 16). SSE2 has
// no unsigned 16-bit compare, so (x >= pos) is tested as a saturating
// subtraction (pos - x) followed by a compare with zero.
const unsigned unroll_factor = 8;

// Very short vectors use a plain scalar scan (enclosing branch elided)
for (j = 0; j < size; ++j)
    if (pbuf[j] >= pos)
        break;

__m128i m1, mz, maskF, maskFL;

mz = _mm_setzero_si128();
m1 = _mm_loadu_si128((__m128i*)(pbuf));  // first 8 elements (unaligned load)

// Force lanes at index >= size to 0xFFFF so they always compare as >= pos
maskF  = _mm_cmpeq_epi32(mz, mz);        // all bits set
maskFL = _mm_slli_si128(maskF, 4 * 2);   // byte shift: ones only in lanes 4..7
int shiftL = (64 - (unroll_factor - size) * 16);
maskFL = _mm_slli_epi64(maskFL, shiftL); // ones only in lanes size..7

m1 = _mm_andnot_si128(maskFL, m1);       // clear the tail lanes of the data
m1 = _mm_or_si128(m1, maskFL);           // and saturate them to 0xFFFF

__m128i mp = _mm_set1_epi16(pos);        // broadcast the search key
__m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // lanes where value >= pos
int mi = _mm_movemask_epi8(mge_mask);    // two flag bits per 16-bit lane
// ... (if mi != 0: the result is the lowest set bit index of mi divided by 2) ...

// Second chunk: re-read with a step back so the 16-byte load ends at pbuf[size]
m1 = _mm_loadu_si128((__m128i*)(pbuf2));
mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz);
mi = _mm_movemask_epi8(mge_mask);

// bsr_i: lane index of the lowest set flag bit in mi
return size - (unroll_factor - bsr_i);
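
// Reference sketch (illustrative only; the helper name is an assumption): the
// scalar equivalent of the vector search above, the index of the first
// element >= pos in a sorted buffer of 16-bit values.
static inline unsigned gap_find_scalar(const unsigned short* pbuf,
                                       unsigned short pos, unsigned size)
{
    unsigned i = 0;
    for (; i < size; ++i)
        if (pbuf[i] >= pos)
            break;
    return i;  // == size when no element is >= pos
}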