BitMagic-C++
bmsse4.h
1#ifndef BMSSE4__H__INCLUDED__
2#define BMSSE4__H__INCLUDED__
3/*
4Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmsse4.h
22 \brief Compute functions for SSE4.2 SIMD instruction set (internal)
23*/
24
25#include<mmintrin.h>
26#include<emmintrin.h>
27#include<smmintrin.h>
28#include<nmmintrin.h>
29#include<immintrin.h>
30
31#include "bmdef.h"
32#include "bmsse_util.h"
33#include "bmutil.h"
34
35namespace bm
36{
37
38/** @defgroup SSE4 SSE4.2 functions (internal)
39 Processor specific optimizations for SSE4.2 instructions (internals)
40 @internal
41 @ingroup bvector
42 */
43
44#ifdef __GNUG__
45#pragma GCC diagnostic push
46#pragma GCC diagnostic ignored "-Wconversion"
47#endif
48
49#ifdef _MSC_VER
50#pragma warning( push )
51#pragma warning( disable : 4146)
52#endif
53
54
55/*
56inline
57void sse2_print128(const char* prefix, const __m128i & value)
58{
59 const size_t n = sizeof(__m128i) / sizeof(unsigned);
60 unsigned buffer[n];
61 _mm_storeu_si128((__m128i*)buffer, value);
62 std::cout << prefix << " [ ";
63 for (int i = n-1; 1; --i)
64 {
65 std::cout << buffer[i] << " ";
66 if (i == 0)
67 break;
68 }
69 std::cout << "]" << std::endl;
70}
71*/
72
73/*!
 74 SSE4.2 optimized bit counting.
75 @ingroup SSE4
76*/
77inline
78bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
79{
80 bm::id_t count = 0;
81#ifdef BM64_SSE4
82 const bm::id64_t* b = (bm::id64_t*) block;
83 const bm::id64_t* b_end = (bm::id64_t*) block_end;
84 do
85 {
86 count += unsigned( _mm_popcnt_u64(b[0]) +
87 _mm_popcnt_u64(b[1]));
88 b += 2;
89 } while (b < b_end);
90#else
91 do
92 {
93 const unsigned* b = (unsigned*) block;
94 count += _mm_popcnt_u32(b[0]) +
95 _mm_popcnt_u32(b[1]) +
96 _mm_popcnt_u32(b[2]) +
97 _mm_popcnt_u32(b[3]);
98 } while (++block < block_end);
99#endif
100 return count;
101}
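An illustrative usage sketch (not part of the header): counting the set bits of one 16-byte aligned bit-block. It assumes the translation unit is built with SSE4.2 enabled (e.g. -msse4.2) and pulls this header in through the usual BitMagic configuration.

    bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0,};
    blk[0]  = 0xFFu;      // 8 bits set
    blk[10] = 0x10001u;   // 2 more bits set
    bm::id_t cnt = bm::sse4_bit_count((const __m128i*)blk,
                                      (const __m128i*)(blk + bm::set_block_size));
    // expected: cnt == 10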
102
103/*!
104\internal
105*/
106BMFORCEINLINE
107unsigned op_xor(unsigned a, unsigned b)
108{
109 unsigned ret = (a ^ b);
110 return ret;
111}
112
113/*!
114\internal
115*/
116BMFORCEINLINE
117unsigned op_or(unsigned a, unsigned b)
118{
119 return (a | b);
120}
121
122/*!
123\internal
124*/
125BMFORCEINLINE
126unsigned op_and(unsigned a, unsigned b)
127{
128 return (a & b);
129}
130
131
132template<class Func>
133bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
134 const __m128i* BMRESTRICT block_end,
135 const __m128i* BMRESTRICT mask_block,
136 Func sse2_func)
137{
138 bm::id_t count = 0;
139#ifdef BM64_SSE4
140 do
141 {
142 __m128i tmp0 = _mm_load_si128(block);
143 __m128i tmp1 = _mm_load_si128(mask_block);
144 __m128i b = sse2_func(tmp0, tmp1);
145
146 count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 0));
147 count += (unsigned)_mm_popcnt_u64(_mm_extract_epi64(b, 1));
148
149 ++block; ++mask_block;
150 } while (block < block_end);
151#else
152 do
153 {
154 __m128i tmp0 = _mm_load_si128(block);
155 __m128i tmp1 = _mm_load_si128(mask_block);
156 __m128i b = sse2_func(tmp0, tmp1);
157
158 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
159 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
160 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
161 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
162
163 ++block; ++mask_block;
164 } while (block < block_end);
165#endif
166
167 return count;
168}
169
170/*!
171 @brief check if block is all zero bits
172 @ingroup SSE4
173*/
174inline
175bool sse4_is_all_zero(const __m128i* BMRESTRICT block)
176{
177 __m128i w;
178 __m128i maskz = _mm_setzero_si128();
179 const __m128i* BMRESTRICT block_end =
180 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
181
182 do
183 {
184 w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
185 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
186 return false;
187 w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
188 if (!_mm_test_all_ones(_mm_cmpeq_epi8(w, maskz))) // (w0 | w1) != maskz
189 return false;
190 block += 4;
191 } while (block < block_end);
192 return true;
193}
194
195/*!
196 @brief check if digest stride is all zero bits
197 @ingroup SSE4
198*/
199inline
200bool sse4_is_digest_zero(const __m128i* BMRESTRICT block)
201{
202 __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
203 __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
204 wA = _mm_or_si128(wA, wB);
205 bool z1 = _mm_test_all_zeros(wA, wA);
206
207 wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
208 wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
209 wA = _mm_or_si128(wA, wB);
210 bool z2 = _mm_test_all_zeros(wA, wA);
211 return z1 & z2;
212}
213
214/*!
215 @brief set digest stride to 0xFF.. or 0x0 value
216 @ingroup SSE4
217*/
218inline
219void sse4_block_set_digest(__m128i* dst, unsigned value)
220{
221 __m128i mV = _mm_set1_epi32(int(value));
222 _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
223 _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
224 _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
225 _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
226}
227
228
229/*!
 230 @brief AND blocks
231 *dst &= *src
232
233 @return 0 if no bits were set
234 @ingroup SSE4
235*/
236inline
237unsigned sse4_and_block(__m128i* BMRESTRICT dst,
238 const __m128i* BMRESTRICT src)
239{
240 __m128i m1A, m1B, m1C, m1D;
241 __m128i accA, accB, accC, accD;
242
243 const __m128i* BMRESTRICT src_end =
244 (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);
245
246 accA = accB = accC = accD = _mm_setzero_si128();
247
248 do
249 {
250 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
251 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
252 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
253 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
254
255 _mm_store_si128(dst+0, m1A);
256 _mm_store_si128(dst+1, m1B);
257 _mm_store_si128(dst+2, m1C);
258 _mm_store_si128(dst+3, m1D);
259
260 accA = _mm_or_si128(accA, m1A);
261 accB = _mm_or_si128(accB, m1B);
262 accC = _mm_or_si128(accC, m1C);
263 accD = _mm_or_si128(accD, m1D);
264
265 src += 4; dst += 4;
266 } while (src < src_end);
267
268 accA = _mm_or_si128(accA, accB); // A = A | B
269 accC = _mm_or_si128(accC, accD); // C = C | D
270 accA = _mm_or_si128(accA, accC); // A = A | C
271
272 return !_mm_testz_si128(accA, accA);
273}
274
275
276/*!
277 @brief AND block digest stride
278 *dst &= *src
279
 280 @return true if stride is all zero
281 @ingroup SSE4
282*/
283inline
284bool sse4_and_digest(__m128i* BMRESTRICT dst,
285 const __m128i* BMRESTRICT src)
286{
287 __m128i m1A, m1B, m1C, m1D;
288
289 m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
290 m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
291 m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
292 m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
293
294 _mm_store_si128(dst+0, m1A);
295 _mm_store_si128(dst+1, m1B);
296 _mm_store_si128(dst+2, m1C);
297 _mm_store_si128(dst+3, m1D);
298
299 m1A = _mm_or_si128(m1A, m1B);
300 m1C = _mm_or_si128(m1C, m1D);
301 m1A = _mm_or_si128(m1A, m1C);
302
303 bool z1 = _mm_testz_si128(m1A, m1A);
304
305 m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
306 m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
307 m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
308 m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
309
310 _mm_store_si128(dst+4, m1A);
311 _mm_store_si128(dst+5, m1B);
312 _mm_store_si128(dst+6, m1C);
313 _mm_store_si128(dst+7, m1D);
314
315 m1A = _mm_or_si128(m1A, m1B);
316 m1C = _mm_or_si128(m1C, m1D);
317 m1A = _mm_or_si128(m1A, m1C);
318
319 bool z2 = _mm_testz_si128(m1A, m1A);
320
321 return z1 & z2;
322}
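For reference, a scalar sketch of what one digest-stride AND computes; a stride is the eight 128-bit registers processed above, i.e. bm::set_block_digest_wave_size 32-bit words. Illustrative only, not part of the header.

    static bool and_digest_scalar(bm::word_t* dst, const bm::word_t* src)
    {
        bm::word_t acc = 0;
        for (unsigned i = 0; i < bm::set_block_digest_wave_size; ++i)
            acc |= (dst[i] &= src[i]);   // AND in place, accumulate surviving bits
        return acc == 0;                 // true if the whole stride is now zero
    }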
323
324/*!
325 @brief AND block digest stride
 326 *dst = *src1 & *src2
327
 328 @return true if stride is all zero
329 @ingroup SSE4
330*/
331inline
332bool sse4_and_digest_2way(__m128i* BMRESTRICT dst,
333 const __m128i* BMRESTRICT src1,
334 const __m128i* BMRESTRICT src2)
335{
336 __m128i m1A, m1B, m1C, m1D;
337
338 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
339 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
340 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
341 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
342
343 _mm_store_si128(dst+0, m1A);
344 _mm_store_si128(dst+1, m1B);
345 _mm_store_si128(dst+2, m1C);
346 _mm_store_si128(dst+3, m1D);
347
348 m1A = _mm_or_si128(m1A, m1B);
349 m1C = _mm_or_si128(m1C, m1D);
350 m1A = _mm_or_si128(m1A, m1C);
351
352 bool z1 = _mm_testz_si128(m1A, m1A);
353
354 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
355 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
356 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
357 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
358
359 _mm_store_si128(dst+4, m1A);
360 _mm_store_si128(dst+5, m1B);
361 _mm_store_si128(dst+6, m1C);
362 _mm_store_si128(dst+7, m1D);
363
364 m1A = _mm_or_si128(m1A, m1B);
365 m1C = _mm_or_si128(m1C, m1D);
366 m1A = _mm_or_si128(m1A, m1C);
367
368 bool z2 = _mm_testz_si128(m1A, m1A);
369
370 return z1 & z2;
371}
372
373/*!
374 @brief AND block digest stride
 375 @return true if stride is all zero
376 @ingroup SSE4
377*/
378inline
379bool sse4_and_digest_5way(__m128i* BMRESTRICT dst,
380 const __m128i* BMRESTRICT src1,
381 const __m128i* BMRESTRICT src2,
382 const __m128i* BMRESTRICT src3,
383 const __m128i* BMRESTRICT src4)
384{
385 __m128i m1A, m1B, m1C, m1D;
386 __m128i m1E, m1F, m1G, m1H;
387
388 m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
389 m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
390 m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
391 m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
392
393 m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
394 m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
395 m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
396 m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
397
398 m1A = _mm_and_si128(m1A, m1E);
399 m1B = _mm_and_si128(m1B, m1F);
400 m1C = _mm_and_si128(m1C, m1G);
401 m1D = _mm_and_si128(m1D, m1H);
402
403 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
404 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
405 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
406 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
407
408 _mm_store_si128(dst+0, m1A);
409 _mm_store_si128(dst+1, m1B);
410 _mm_store_si128(dst+2, m1C);
411 _mm_store_si128(dst+3, m1D);
412
413 m1A = _mm_or_si128(m1A, m1B);
414 m1C = _mm_or_si128(m1C, m1D);
415 m1A = _mm_or_si128(m1A, m1C);
416
417 bool z1 = _mm_testz_si128(m1A, m1A);
418
419 m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
420 m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
421 m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
422 m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
423
424 m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
425 m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
426 m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
427 m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
428
429 m1A = _mm_and_si128(m1A, m1E);
430 m1B = _mm_and_si128(m1B, m1F);
431 m1C = _mm_and_si128(m1C, m1G);
432 m1D = _mm_and_si128(m1D, m1H);
433
434 m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
435 m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
436 m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
437 m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
438
439 _mm_store_si128(dst+4, m1A);
440 _mm_store_si128(dst+5, m1B);
441 _mm_store_si128(dst+6, m1C);
442 _mm_store_si128(dst+7, m1D);
443
444 m1A = _mm_or_si128(m1A, m1B);
445 m1C = _mm_or_si128(m1C, m1D);
446 m1A = _mm_or_si128(m1A, m1C);
447
448 bool z2 = _mm_testz_si128(m1A, m1A);
449
450 return z1 & z2;
451}
452
453
454/*!
455 @brief SUB (AND NOT) block digest stride
456 *dst &= ~*src
457
 458 @return true if stride is all zero
459 @ingroup SSE4
460*/
461inline
462bool sse4_sub_digest(__m128i* BMRESTRICT dst,
463 const __m128i* BMRESTRICT src)
464{
465 __m128i m1A, m1B, m1C, m1D;
466
467 m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
468 m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
469 m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
470 m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
471
472 _mm_store_si128(dst+0, m1A);
473 _mm_store_si128(dst+1, m1B);
474 _mm_store_si128(dst+2, m1C);
475 _mm_store_si128(dst+3, m1D);
476
477 m1A = _mm_or_si128(m1A, m1B);
478 m1C = _mm_or_si128(m1C, m1D);
479 m1A = _mm_or_si128(m1A, m1C);
480
481 bool z1 = _mm_testz_si128(m1A, m1A);
482
483 m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
484 m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
485 m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
486 m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
487
488 _mm_store_si128(dst+4, m1A);
489 _mm_store_si128(dst+5, m1B);
490 _mm_store_si128(dst+6, m1C);
491 _mm_store_si128(dst+7, m1D);
492
493 m1A = _mm_or_si128(m1A, m1B);
494 m1C = _mm_or_si128(m1C, m1D);
495 m1A = _mm_or_si128(m1A, m1C);
496
497 bool z2 = _mm_testz_si128(m1A, m1A);
498
499 return z1 & z2;
500}
501
502
503/*!
504 @brief 2-operand SUB (AND NOT) block digest stride
 505 *dst = *src1 & ~*src2
506
 507 @return true if stride is all zero
508 @ingroup SSE4
509*/
510inline
511bool sse4_sub_digest_2way(__m128i* BMRESTRICT dst,
512 const __m128i* BMRESTRICT src1,
513 const __m128i* BMRESTRICT src2)
514{
515 __m128i m1A, m1B, m1C, m1D;
516
517 m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
518 m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
519 m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
520 m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
521
522 _mm_store_si128(dst+0, m1A);
523 _mm_store_si128(dst+1, m1B);
524 _mm_store_si128(dst+2, m1C);
525 _mm_store_si128(dst+3, m1D);
526
527 m1A = _mm_or_si128(m1A, m1B);
528 m1C = _mm_or_si128(m1C, m1D);
529 m1A = _mm_or_si128(m1A, m1C);
530
531 bool z1 = _mm_testz_si128(m1A, m1A);
532
533 m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
534 m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
535 m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
536 m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
537
538 _mm_store_si128(dst+4, m1A);
539 _mm_store_si128(dst+5, m1B);
540 _mm_store_si128(dst+6, m1C);
541 _mm_store_si128(dst+7, m1D);
542
543 m1A = _mm_or_si128(m1A, m1B);
544 m1C = _mm_or_si128(m1C, m1D);
545 m1A = _mm_or_si128(m1A, m1C);
546
547 bool z2 = _mm_testz_si128(m1A, m1A);
548
549 return z1 & z2;
550}
551
552
553
554/*!
 555 @brief check if block is all one bits
556 @ingroup SSE4
557*/
558inline
559bool sse4_is_all_one(const __m128i* BMRESTRICT block)
560{
561 __m128i w;
562 const __m128i* BMRESTRICT block_end =
563 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
564
565 do
566 {
567 w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
568 if (!_mm_test_all_ones(w))
569 return false;
570 w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
571 if (!_mm_test_all_ones(w))
572 return false;
573
574 block+=4;
575 } while (block < block_end);
576 return true;
577}
578
579/*!
 580 @brief check if SSE wave is all 0xFFFF...FFF
581 @ingroup SSE4
582*/
583BMFORCEINLINE
584bool sse42_test_all_one_wave(const void* ptr)
585{
586 return _mm_test_all_ones(_mm_loadu_si128((__m128i*)ptr));
587}
588
589
590/*!
591 @brief check if wave of pointers is all NULL
592 @ingroup SSE4
593*/
594BMFORCEINLINE
595bool sse42_test_all_zero_wave(const void* ptr)
596{
597 __m128i w0 = _mm_loadu_si128((__m128i*)ptr);
598 return _mm_testz_si128(w0, w0);
599}
600
601/*!
602 @brief check if 2 waves of pointers are all NULL
603 @ingroup SSE4
604*/
605BMFORCEINLINE
606bool sse42_test_all_zero_wave2(const void* ptr0, const void* ptr1)
607{
608 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
609 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
610 w0 = _mm_or_si128(w0, w1);
611 return _mm_testz_si128(w0, w0);
612}
613
614/*!
615 @brief check if wave of 2 pointers are the same (null or FULL)
616 @ingroup SSE4
617*/
618BMFORCEINLINE
619bool sse42_test_all_eq_wave2(const void* ptr0, const void* ptr1)
620{
621 __m128i w0 = _mm_loadu_si128((__m128i*)ptr0);
622 __m128i w1 = _mm_loadu_si128((__m128i*)ptr1);
623 w0 = _mm_xor_si128(w0, w1);
624 return _mm_testz_si128(w0, w0);
625}
626
627
628/*!
629 SSE4.2 calculate number of bit changes from 0 to 1
630 @ingroup SSE4
631*/
632inline
633unsigned sse42_bit_block_calc_change(const __m128i* BMRESTRICT block,
634 unsigned size)
635{
636 const __m128i* block_end =
637 ( __m128i*)((bm::word_t*)(block) + size); // bm::set_block_size
638 __m128i m1COshft, m2COshft;
639
640 unsigned w0 = *((bm::word_t*)(block));
641 unsigned count = 1;
642
643 unsigned co2, co1 = 0;
644 for (;block < block_end; block += 2)
645 {
646 __m128i m1A = _mm_load_si128(block);
647 __m128i m2A = _mm_load_si128(block+1);
648
649 __m128i m1CO = _mm_srli_epi32(m1A, 31);
650 __m128i m2CO = _mm_srli_epi32(m2A, 31);
651
652 co2 = _mm_extract_epi32(m1CO, 3);
653
654 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
655 __m128i m2As = _mm_slli_epi32(m2A, 1);
656
657 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
658 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
659
660 co1 = co2;
661
662 co2 = _mm_extract_epi32(m2CO, 3);
663
664 m2COshft = _mm_slli_si128 (m2CO, 4);
665 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
666
667 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
668 m2As = _mm_or_si128(m2As, m2COshft);
669
670 co1 = co2;
671
672 // we now have two shifted SSE4 regs with carry-over
673 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
674 m2A = _mm_xor_si128(m2A, m2As);
675
676#ifdef BM64_SSE4
677 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
678 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
679 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
680
681 m0 = _mm_extract_epi64(m2A, 0);
682 m1 = _mm_extract_epi64(m2A, 1);
683 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
684#else
685 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
686 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
687 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
688 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
689 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
690 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
691
692 m0 = _mm_extract_epi32(m2A, 0);
693 m1 = _mm_extract_epi32(m2A, 1);
694 m2 = _mm_extract_epi32(m2A, 2);
695 m3 = _mm_extract_epi32(m2A, 3);
696 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
697 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
698#endif
699
700 }
701 count -= (w0 & 1u); // correct initial carry-in error
702 return count;
703}
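The same computation in scalar form (an illustrative sketch, not the library's scalar path): the block is treated as one long bit string, every adjacent pair of differing bits adds one, and the result is the number of runs of equal bits.

    static unsigned bit_block_change_scalar(const bm::word_t* block, unsigned size)
    {
        unsigned count = 1;
        unsigned co = 0;                                // previous word's top bit
        for (unsigned i = 0; i < size; ++i)
        {
            bm::word_t w  = block[i];
            bm::word_t ws = (w << 1) | co;              // shift the carry bit in
            count += unsigned(_mm_popcnt_u32(w ^ ws));  // adjacent-bit differences
            co = w >> 31;
        }
        return count - (block[0] & 1u);                 // bit 0 is not a transition
    }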
704
705
706/*!
707 SSE4.2 calculate number of bit changes from 0 to 1 of a XOR product
708 @ingroup SSE4
709*/
710inline
711unsigned sse42_bit_block_calc_xor_change(const __m128i* BMRESTRICT block,
712 const __m128i* BMRESTRICT xor_block,
713 unsigned size)
714{
715 const __m128i* block_end =
716 ( __m128i*)((bm::word_t*)(block) + size);
717 __m128i m1COshft, m2COshft;
718
719 unsigned w0 = *((bm::word_t*)(block));
720 unsigned count = 1;
721
722 unsigned co2, co1 = 0;
723 for (;block < block_end; block += 2, xor_block += 2)
724 {
725 __m128i m1A = _mm_load_si128(block);
726 __m128i m2A = _mm_load_si128(block+1);
727 __m128i m1B = _mm_load_si128(xor_block);
728 __m128i m2B = _mm_load_si128(xor_block+1);
729
730 m1A = _mm_xor_si128(m1A, m1B);
731 m2A = _mm_xor_si128(m2A, m2B);
732
733 __m128i m1CO = _mm_srli_epi32(m1A, 31);
734 __m128i m2CO = _mm_srli_epi32(m2A, 31);
735
736 co2 = _mm_extract_epi32(m1CO, 3);
737
738 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
739 __m128i m2As = _mm_slli_epi32(m2A, 1);
740
741 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
742 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
743
744 co1 = co2;
745
746 co2 = _mm_extract_epi32(m2CO, 3);
747
748 m2COshft = _mm_slli_si128 (m2CO, 4);
749 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
750
751 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
752 m2As = _mm_or_si128(m2As, m2COshft);
753
754 co1 = co2;
755
756 // we now have two shifted SSE4 regs with carry-over
757 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
758 m2A = _mm_xor_si128(m2A, m2As);
759
760#ifdef BM64_SSE4
761 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
762 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
763 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
764
765 m0 = _mm_extract_epi64(m2A, 0);
766 m1 = _mm_extract_epi64(m2A, 1);
767 count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
768#else
769 bm::id_t m0 = _mm_extract_epi32(m1A, 0);
770 bm::id_t m1 = _mm_extract_epi32(m1A, 1);
771 bm::id_t m2 = _mm_extract_epi32(m1A, 2);
772 bm::id_t m3 = _mm_extract_epi32(m1A, 3);
773 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
774 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
775
776 m0 = _mm_extract_epi32(m2A, 0);
777 m1 = _mm_extract_epi32(m2A, 1);
778 m2 = _mm_extract_epi32(m2A, 2);
779 m3 = _mm_extract_epi32(m2A, 3);
780 count += unsigned(_mm_popcnt_u32(m0) + _mm_popcnt_u32(m1) +
781 _mm_popcnt_u32(m2) + _mm_popcnt_u32(m3));
782#endif
783
784 }
785 count -= (w0 & 1u); // correct initial carry-in error
786 return count;
787}
788
789
790
791#ifdef BM64_SSE4
792
793/*!
794 SSE4.2 calculate number of bit changes from 0 to 1
795 @ingroup SSE4
796*/
797inline
798void sse42_bit_block_calc_change_bc(const __m128i* BMRESTRICT block,
799 unsigned* gc, unsigned* bc)
800{
801 const __m128i* block_end =
802 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
803 __m128i m1COshft, m2COshft;
804
805 unsigned w0 = *((bm::word_t*)(block));
806 unsigned bit_count = 0;
807 unsigned gap_count = 1;
808
809 unsigned co2, co1 = 0;
810 for (;block < block_end; block += 2)
811 {
812 __m128i m1A = _mm_load_si128(block);
813 __m128i m2A = _mm_load_si128(block+1);
814 {
815 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
816 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
817 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
818 m0 = _mm_extract_epi64(m2A, 0);
819 m1 = _mm_extract_epi64(m2A, 1);
820 bit_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
821 }
822
823 __m128i m1CO = _mm_srli_epi32(m1A, 31);
824 __m128i m2CO = _mm_srli_epi32(m2A, 31);
825
826 co2 = _mm_extract_epi32(m1CO, 3);
827
828 __m128i m1As = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
829 __m128i m2As = _mm_slli_epi32(m2A, 1);
830
831 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
832 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
833
834 co1 = co2;
835
836 co2 = _mm_extract_epi32(m2CO, 3);
837
838 m2COshft = _mm_slli_si128 (m2CO, 4);
839 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
840
841 m1As = _mm_or_si128(m1As, m1COshft); // block[i] |= co_flag
842 m2As = _mm_or_si128(m2As, m2COshft);
843
844 co1 = co2;
845
846 // we now have two shifted SSE4 regs with carry-over
847 m1A = _mm_xor_si128(m1A, m1As); // w ^= (w >> 1);
848 m2A = _mm_xor_si128(m2A, m2As);
849 {
850 bm::id64_t m0 = _mm_extract_epi64(m1A, 0);
851 bm::id64_t m1 = _mm_extract_epi64(m1A, 1);
852 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
853 }
854
855 bm::id64_t m0 = _mm_extract_epi64(m2A, 0);
856 bm::id64_t m1 = _mm_extract_epi64(m2A, 1);
857 gap_count += unsigned(_mm_popcnt_u64(m0) + _mm_popcnt_u64(m1));
858
859 }
860 gap_count -= (w0 & 1u); // correct initial carry-in error
861 *gc = gap_count;
862 *bc = bit_count;
863}
864
865#endif
866
867
868/*!
869 \brief Find first bit which is different between two bit-blocks
 870 @ingroup SSE4
871*/
872inline
873bool sse42_bit_find_first_diff(const __m128i* BMRESTRICT block1,
874 const __m128i* BMRESTRICT block2,
875 unsigned* pos)
876{
877 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
878
879 const __m128i* block1_end =
880 (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
881 __m128i maskZ = _mm_setzero_si128();
882 __m128i mA, mB;
883 unsigned simd_lane = 0;
884 do
885 {
886 mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
887 mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
888 __m128i mOR = _mm_or_si128(mA, mB);
889 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
890 {
891 if (!_mm_test_all_zeros(mA, mA))
892 {
893 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
894 mask = ~mask; // invert to find (w != 0)
895 BM_ASSERT(mask);
896 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
897 _mm_store_si128 ((__m128i*)simd_buf, mA);
898 unsigned widx = bsf >> 2; // (bsf / 4);
899 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
900 bsf = bm::bsf_asm32(w); // find first bit != 0
901 *pos = (simd_lane * 128) + (widx * 32) + bsf;
902 return true;
903 }
904 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
905 mask = ~mask; // invert to find (w != 0)
906 BM_ASSERT(mask);
907 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
908 _mm_store_si128 ((__m128i*)simd_buf, mB);
909 unsigned widx = bsf >> 2; // (bsf / 4);
910 unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
911 bsf = bm::bsf_asm32(w); // find first bit != 0
912 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
913 return true;
914 }
915
916 simd_lane+=2;
917 block1+=2; block2+=2;
918
919 } while (block1 < block1_end);
920 return false;
921}
922
923
924/*!
925 \brief Find first non-zero bit
 926 @ingroup SSE4
927*/
928inline
929bool sse42_bit_find_first(const __m128i* BMRESTRICT block,
930 unsigned* pos)
931{
932 unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
933
934 const __m128i* block_end =
935 (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
936 __m128i maskZ = _mm_setzero_si128();
937 __m128i mA, mB;
938 unsigned simd_lane = 0;
939 do
940 {
941 mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
942 __m128i mOR = _mm_or_si128(mA, mB);
943 if (!_mm_test_all_zeros(mOR, mOR)) // test 2x128 lanes
944 {
945 if (!_mm_test_all_zeros(mA, mA))
946 {
947 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
948 mask = ~mask; // invert to find (w != 0)
949 BM_ASSERT(mask);
950 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
951 _mm_store_si128 ((__m128i*)simd_buf, mA);
952 unsigned widx = bsf >> 2; // (bsf / 4);
953 unsigned w = simd_buf[widx];
954 bsf = bm::bsf_asm32(w); // find first bit != 0
955 *pos = (simd_lane * 128) + (widx * 32) + bsf;
956 return true;
957 }
958 unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
959 mask = ~mask; // invert to find (w != 0)
960 BM_ASSERT(mask);
961 int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
962 _mm_store_si128 ((__m128i*)simd_buf, mB);
963 unsigned widx = bsf >> 2; // (bsf / 4);
964 unsigned w = simd_buf[widx];
965 bsf = bm::bsf_asm32(w); // find first bit != 0
966 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
967 return true;
968 }
969
970 simd_lane+=2;
971 block+=2;
972
973 } while (block < block_end);
974 return false;
975}
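An illustrative call (not part of the header), assuming a 16-byte aligned block:

    bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0,};
    blk[3] = 0x10u;                           // bit 4 of word 3
    unsigned pos;
    bool found = bm::sse42_bit_find_first((const __m128i*)blk, &pos);
    // expected: found == true, pos == 3*32 + 4 == 100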
976
977
978
979
980#ifdef __GNUG__
981// necessary measure to silence false warning from GCC about negative pointer arithmetic
982#pragma GCC diagnostic push
983#pragma GCC diagnostic ignored "-Warray-bounds"
984#endif
985
986/*!
987 SSE4.2 check for one to two (variable len) 128 bit SSE lines
988 for gap search results (8 elements)
989 @ingroup SSE4
990 \internal
991*/
992inline
993unsigned sse4_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
994 const bm::gap_word_t pos, const unsigned size)
995{
996 BM_ASSERT(size <= 16);
997 BM_ASSERT(size);
998
999 const unsigned unroll_factor = 8;
1000 if (size < 4) // for very short vector use conventional scan
1001 {
1002 unsigned j;
1003 for (j = 0; j < size; ++j)
1004 {
1005 if (pbuf[j] >= pos)
1006 break;
1007 }
1008 return j;
1009 }
1010
1011 __m128i m1, mz, maskF, maskFL;
1012
1013 mz = _mm_setzero_si128();
1014 m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1015
1016 maskF = _mm_cmpeq_epi64(mz, mz); // set all FF
1017 maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1018 int shiftL= (64 - (unroll_factor - size) * 16);
1019 maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1020
1021 m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1022 m1 = _mm_or_si128(m1, maskFL);
1023
1024 __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1025 __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1026 __m128i c_mask = _mm_slli_epi16(mge_mask, 15); // clear not needed flag bits by shift
1027 int mi = _mm_movemask_epi8(c_mask); // collect flag bits
1028 if (mi)
1029 {
1030 // alternative: int bsr_i= bm::bit_scan_fwd(mi) >> 1;
1031 unsigned bc = _mm_popcnt_u32(mi); // gives us number of elements >= pos
1032 return unroll_factor - bc; // address of first one element (target)
1033 }
 1034 // inspect the next lane with a possible step back (to avoid reading past the block boundary)
1035 // GCC gives a false warning for "- unroll_factor" here
1036 const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1037 BM_ASSERT(pbuf2 > pbuf || size == 8); // assert in place to make sure GCC warning is indeed false
1038
1039 m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1040 mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1041 mi = _mm_movemask_epi8(_mm_slli_epi16(mge_mask, 15));
1042 unsigned bc = _mm_popcnt_u32(mi);
1043
1044 return size - bc;
1045}
1046
1047/**
1048 Hybrid binary search, starts as binary, then switches to linear scan
1049
1050 \param buf - GAP buffer pointer.
1051 \param pos - index of the element.
1052 \param is_set - output. GAP value (0 or 1).
1053 \return GAP index.
1054
1055 @ingroup SSE4
1056*/
1057inline
1058unsigned sse42_gap_bfind(const unsigned short* BMRESTRICT buf,
1059 unsigned pos, unsigned* BMRESTRICT is_set)
1060{
1061 unsigned start = 1;
1062 unsigned end = 1 + ((*buf) >> 3);
1063 unsigned dsize = end - start;
1064
1065 if (dsize < 17)
1066 {
1067 start = bm::sse4_gap_find(buf+1, (bm::gap_word_t)pos, dsize);
1068 *is_set = ((*buf) & 1) ^ (start & 1);
1069 BM_ASSERT(buf[start+1] >= pos);
1070 BM_ASSERT(buf[start] < pos || (start==0));
1071
1072 return start+1;
1073 }
1074 unsigned arr_end = end;
1075 while (start != end)
1076 {
1077 unsigned curr = (start + end) >> 1;
1078 if (buf[curr] < pos)
1079 start = curr + 1;
1080 else
1081 end = curr;
1082
1083 unsigned size = end - start;
1084 if (size < 16)
1085 {
1086 size += (end != arr_end);
1087 unsigned idx =
1088 bm::sse4_gap_find(buf + start, (bm::gap_word_t)pos, size);
1089 start += idx;
1090
1091 BM_ASSERT(buf[start] >= pos);
1092 BM_ASSERT(buf[start - 1] < pos || (start == 1));
1093 break;
1094 }
1095 }
1096
1097 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1098 return start;
1099}
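The hybrid strategy can be sketched generically (illustrative; this is not the GAP-specific routine above, which works on the 1-based GAP buffer with its header word, just the search pattern it follows): bisect until the window is small, then finish with a plain scan.

    static unsigned hybrid_lower_bound(const unsigned short* a, unsigned len,
                                       unsigned pos)
    {
        unsigned start = 0, end = len;
        while (end - start >= 16)                          // binary phase
        {
            unsigned mid = (start + end) >> 1;
            if (a[mid] < pos) start = mid + 1; else end = mid;
        }
        for (; start < end && a[start] < pos; ++start) {}  // linear phase
        return start;                  // first index with a[start] >= pos, or len
    }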
1100
1101/**
1102 Hybrid binary search, starts as binary, then switches to scan
1103 @ingroup SSE4
1104*/
1105inline
1106unsigned sse42_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
1107{
1108 unsigned is_set;
1109 bm::sse42_gap_bfind(buf, pos, &is_set);
1110 return is_set;
1111}
1112
1113
1114
1115/**
1116 Experimental (test) function to do SIMD vector search (lower bound)
1117 in sorted, growing array
1118 @ingroup SSE4
1119
1120 \internal
1121*/
1122inline
1123int sse42_cmpge_u32(__m128i vect4, unsigned value)
1124{
1125 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1126 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1127 //
1128 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1129 __m128i mm_val = _mm_set1_epi32(value);
1130
1131 __m128i norm_vect4 = _mm_sub_epi32(vect4, mask0x8); // (signed) vect4 - 0x80000000
1132 __m128i norm_val = _mm_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
1133
1134 __m128i cmp_mask_gt = _mm_cmpgt_epi32 (norm_vect4, norm_val);
1135 __m128i cmp_mask_eq = _mm_cmpeq_epi32 (mm_val, vect4);
1136
1137 __m128i cmp_mask_ge = _mm_or_si128 (cmp_mask_gt, cmp_mask_eq);
1138 int mask = _mm_movemask_epi8(cmp_mask_ge);
1139 if (mask)
1140 {
1141 int bsf = bm::bsf_asm32(mask);//_bit_scan_forward(mask); // could use lzcnt()
1142 return bsf / 4;
1143 }
1144 return -1;
1145}
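The bias trick referenced in the comment can be spelled out in scalar code (an illustrative sketch; it relies on the usual two's-complement conversion of out-of-range unsigned values). SSE has no unsigned 32-bit greater-than compare, so both operands are biased by 0x80000000 and compared as signed, which preserves the unsigned order.

    static bool uge_via_signed_bias(unsigned a, unsigned b)
    {
        int sa = int(a ^ 0x80000000u);   // same as subtracting 0x80000000 (mod 2^32)
        int sb = int(b ^ 0x80000000u);
        return sa >= sb;                 // equals (a >= b) for all 32-bit a, b
    }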
1146
1147
1148/**
 1149 lower bound (greater or equal) linear scan in ascending order sorted array
1150 @ingroup SSE4
1151 \internal
1152*/
1153inline
1154unsigned sse4_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
1155 unsigned target,
1156 unsigned from,
1157 unsigned to)
1158{
1159 // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
1160 // see more at:
1161 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
1162
1163 const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
1164
1165 unsigned unroll_factor = 8;
1166 unsigned len = to - from + 1;
1167 unsigned len_unr = len - (len % unroll_factor);
1168
1169 __m128i mask0x8 = _mm_set1_epi32(0x80000000);
1170 __m128i vect_target = _mm_set1_epi32(target);
1171 __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
1172
1173 int mask;
1174 __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
1175
1176 unsigned k = 0;
1177 for (; k < len_unr; k+=unroll_factor)
1178 {
1179 vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
1180 norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000
1181
1182 cmp_mask_ge = _mm_or_si128( // GT | EQ
1183 _mm_cmpgt_epi32 (norm_vect40, norm_target),
1184 _mm_cmpeq_epi32 (vect40, vect_target)
1185 );
1186 mask = _mm_movemask_epi8(cmp_mask_ge);
1187 if (mask)
1188 {
1189 int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
1190 return from + k + (bsf / 4);
1191 }
1192 vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
1193 norm_vect41 = _mm_sub_epi32(vect41, mask0x8);
1194
1195 cmp_mask_ge = _mm_or_si128(
1196 _mm_cmpgt_epi32 (norm_vect41, norm_target),
1197 _mm_cmpeq_epi32 (vect41, vect_target)
1198 );
1199 mask = _mm_movemask_epi8(cmp_mask_ge);
1200 if (mask)
1201 {
1202 int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
1203 return 4 + from + k + (bsf / 4);
1204 }
1205 } // for
1206
1207 for (; k < len; ++k)
1208 {
1209 if (arr_base[k] >= target)
1210 return from + k;
1211 }
1212 return to + 1;
1213}
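An illustrative call on a small sorted array (not part of the header); the function scans the closed range [from, to] and returns to+1 when every element is smaller than the target.

    unsigned arr[16] = {1,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53};
    unsigned idx = bm::sse4_lower_bound_scan_u32(arr, 25, 0, 15);
    // expected: idx == 9 (arr[9] == 29 is the first element >= 25)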
1214
1215
1216
1217/*!
1218 SSE4.2 index lookup to check what belongs to the same block (8 elements)
1219 \internal
1220*/
1221inline
1222unsigned sse42_idx_arr_block_lookup(const unsigned* idx, unsigned size,
1223 unsigned nb, unsigned start)
1224{
1225 const unsigned unroll_factor = 8;
1226 const unsigned len = (size - start);
1227 const unsigned len_unr = len - (len % unroll_factor);
1228 unsigned k;
1229
1230 idx += start;
1231
1232 __m128i nbM = _mm_set1_epi32(nb);
1233
1234 for (k = 0; k < len_unr; k+=unroll_factor)
1235 {
1236 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1237 __m128i idxB = _mm_loadu_si128((__m128i*)(idx+k+4));
1238 __m128i nbA = _mm_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
1239 __m128i nbB = _mm_srli_epi32(idxB, bm::set_block_shift);
1240
1241 if (!_mm_test_all_ones(_mm_cmpeq_epi32(nbM, nbA)) |
1242 !_mm_test_all_ones(_mm_cmpeq_epi32 (nbM, nbB)))
1243 break;
1244
1245 } // for k
1246 for (; k < len; ++k)
1247 {
1248 if (nb != unsigned(idx[k] >> bm::set_block_shift))
1249 break;
1250 }
1251 return start + k;
1252}
1253
1254/*!
1255 SSE4.2 bulk bit set
1256 \internal
1257*/
1258inline
1259void sse42_set_block_bits(bm::word_t* BMRESTRICT block,
1260 const unsigned* BMRESTRICT idx,
1261 unsigned start, unsigned stop )
1262{
1263 const unsigned unroll_factor = 4;
1264 const unsigned len = (stop - start);
1265 const unsigned len_unr = len - (len % unroll_factor);
1266
1267 idx += start;
1268
1269 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1270 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1271
1272 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1273 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1274
1275 unsigned k = 0;
1276 for (; k < len_unr; k+=unroll_factor)
1277 {
1278 __m128i idxA = _mm_loadu_si128((__m128i*)(idx+k));
1279 __m128i nbitA = _mm_and_si128 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
1280 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1281
1282
1283 nbitA = _mm_and_si128 (nbitA, sw_mask);
1284 _mm_store_si128 ((__m128i*)mshift_v, nbitA);
1285
1286 // check-compare if all 4 bits are in the very same word
1287 //
1288 __m128i nwordA_0 = _mm_shuffle_epi32(nwordA, 0x0); // copy element 0
1289 __m128i cmpA = _mm_cmpeq_epi32(nwordA_0, nwordA); // compare EQ
1290 if (_mm_test_all_ones(cmpA)) // check if all are in one word
1291 {
1292 unsigned nword = _mm_extract_epi32(nwordA, 0);
1293 block[nword] |= (1u << mshift_v[0]) | (1u << mshift_v[1])
1294 |(1u << mshift_v[2]) | (1u << mshift_v[3]);
1295 }
1296 else // bits are in different words, use scalar scatter
1297 {
1298 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1299
1300 block[mword_v[0]] |= (1u << mshift_v[0]);
1301 block[mword_v[1]] |= (1u << mshift_v[1]);
1302 block[mword_v[2]] |= (1u << mshift_v[2]);
1303 block[mword_v[3]] |= (1u << mshift_v[3]);
1304 }
1305
1306 } // for k
1307
1308 for (; k < len; ++k)
1309 {
1310 unsigned n = idx[k];
1311 unsigned nbit = unsigned(n & bm::set_block_mask);
1312 unsigned nword = nbit >> bm::set_word_shift;
1313 nbit &= bm::set_word_mask;
1314 block[nword] |= (1u << nbit);
1315 } // for k
1316}
1317
1318
1319/*!
1320 SSE4.2 bit block gather-scatter
1321
1322 @param arr - destination array to set bits
1323 @param blk - source bit-block
1324 @param idx - gather index array
1325 @param size - gather array size
 1326 @param start - gather start index
1327 @param bit_idx - bit to set in the target array
1328
1329 \internal
1330
1331 C algorithm:
1332
1333 for (unsigned k = start; k < size; ++k)
1334 {
1335 nbit = unsigned(idx[k] & bm::set_block_mask);
1336 nword = unsigned(nbit >> bm::set_word_shift);
1337 mask0 = 1u << (nbit & bm::set_word_mask);
1338 arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
1339 }
1340
1341*/
1342inline
1343void sse4_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
1344 const unsigned* BMRESTRICT blk,
1345 const unsigned* BMRESTRICT idx,
1346 unsigned size,
1347 unsigned start,
1348 unsigned bit_idx)
1349{
1350 const unsigned unroll_factor = 4;
1351 const unsigned len = (size - start);
1352 const unsigned len_unr = len - (len % unroll_factor);
1353
1354 __m128i sb_mask = _mm_set1_epi32(bm::set_block_mask);
1355 __m128i sw_mask = _mm_set1_epi32(bm::set_word_mask);
1356 __m128i maskFF = _mm_set1_epi32(~0u);
1357 __m128i maskZ = _mm_xor_si128(maskFF, maskFF);
1358
1359 __m128i mask_tmp, mask_0;
1360
1361 unsigned BM_ALIGN16 mshift_v[4] BM_ALIGN16ATTR;
1362 unsigned BM_ALIGN16 mword_v[4] BM_ALIGN16ATTR;
1363
1364 unsigned k = 0;
1365 unsigned base = start + k;
1366 __m128i* idx_ptr = (__m128i*)(idx + base); // idx[base]
1367 __m128i* target_ptr = (__m128i*)(arr + base); // arr[base]
1368 for (; k < len_unr; k+=unroll_factor)
1369 {
1370 __m128i nbitA = _mm_and_si128 (_mm_loadu_si128(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
1371 __m128i nwordA = _mm_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
1372 // (nbit & bm::set_word_mask)
1373 _mm_store_si128 ((__m128i*)mshift_v, _mm_and_si128 (nbitA, sw_mask));
1374 _mm_store_si128 ((__m128i*)mword_v, nwordA);
1375
1376 // mask0 = 1u << (nbit & bm::set_word_mask);
1377 //
1378#if 0
1379 // ifdefed an alternative SHIFT implementation using SSE and masks
1380 // (it is not faster than just doing scalar operations)
1381 {
1382 __m128i am_0 = _mm_set_epi32(0, 0, 0, ~0u);
1383 __m128i mask1 = _mm_srli_epi32 (maskFF, 31);
1384 mask_0 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[0]), am_0);
1385 mask_tmp = _mm_and_si128 (_mm_slli_epi32(mask1, mshift_v[1]), _mm_slli_si128 (am_0, 4));
1386 mask_0 = _mm_or_si128 (mask_0, mask_tmp);
1387
1388 __m128i mask_2 = _mm_and_si128 (_mm_slli_epi32 (mask1, mshift_v[2]),
1389 _mm_slli_si128 (am_0, 8));
1390 mask_tmp = _mm_and_si128 (
1391 _mm_slli_epi32(mask1, mshift_v[3]),
1392 _mm_slli_si128 (am_0, 12)
1393 );
1394
1395 mask_0 = _mm_or_si128 (mask_0,
1396 _mm_or_si128 (mask_2, mask_tmp)); // assemble bit-test mask
1397 }
1398#endif
1399 mask_0 = _mm_set_epi32(1 << mshift_v[3], 1 << mshift_v[2], 1 << mshift_v[1], 1 << mshift_v[0]);
1400
1401
1402 // gather for: blk[nword] (.. & mask0 )
1403 //
1404 mask_tmp = _mm_and_si128(_mm_set_epi32(blk[mword_v[3]], blk[mword_v[2]],
1405 blk[mword_v[1]], blk[mword_v[0]]),
1406 mask_0);
1407
1408 // bool(blk[nword] ...)
1409 //maskFF = _mm_set1_epi32(~0u);
1410 mask_tmp = _mm_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF where == 0
1411 mask_tmp = _mm_xor_si128 (mask_tmp, maskFF); // invert
1412 mask_tmp = _mm_srli_epi32 (mask_tmp, 31); // (bool) 1 only to the 0 pos
1413
1414 mask_tmp = _mm_slli_epi32(mask_tmp, bit_idx); // << bit_idx
1415
1416 _mm_storeu_si128 (target_ptr, // arr[base] |= MASK_EXPR
1417 _mm_or_si128 (mask_tmp, _mm_loadu_si128(target_ptr)));
1418
1419 ++idx_ptr; ++target_ptr;
1420 _mm_prefetch((const char*)target_ptr, _MM_HINT_T0);
1421 }
1422
1423 for (; k < len; ++k)
1424 {
1425 base = start + k;
1426 unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
1427 arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
1428 }
1429
1430}
1431
1432/*!
1433 @brief block shift left by 1
1434 @ingroup SSE4
1435*/
1436inline
1437bool sse42_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1)
1438{
1439 __m128i* block_end =
1440 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1441 __m128i mAcc = _mm_set1_epi32(0);
1442 __m128i mMask1 = _mm_set1_epi32(1);
1443
1444 unsigned co2;
1445 for (--block_end; block_end >= block; block_end -= 2)
1446 {
1447 __m128i m1A = _mm_load_si128(block_end);
1448 __m128i m2A = _mm_load_si128(block_end-1);
1449
1450 __m128i m1CO = _mm_and_si128(m1A, mMask1);
1451 __m128i m2CO = _mm_and_si128(m2A, mMask1);
1452
1453 co2 = _mm_extract_epi32(m1CO, 0);
1454
1455 m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1456 m2A = _mm_srli_epi32(m2A, 1);
1457
1458 __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1459 __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1460 m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1461 m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1462 m1COshft = _mm_slli_epi32(m1COshft, 31);
1463 m2COshft = _mm_slli_epi32(m2COshft, 31);
1464
1465 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1466 m2A = _mm_or_si128(m2A, m2COshft);
1467
1468 co1 = _mm_extract_epi32(m2CO, 0);
1469
1470 _mm_store_si128(block_end, m1A);
1471 _mm_store_si128(block_end-1, m2A);
1472
1473 mAcc = _mm_or_si128(mAcc, m1A);
1474 mAcc = _mm_or_si128(mAcc, m2A);
1475 } // for
1476
1477 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1478 return co1;
1479}
1480
1481
1482/*!
1483 @brief block shift right by 1
1484 @ingroup SSE4
1485*/
1486inline
1487bool sse42_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1)
1488{
1489 __m128i* block_end =
1490 ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1491 __m128i m1COshft, m2COshft;
1492 __m128i mAcc = _mm_set1_epi32(0);
1493
1494 unsigned co2;
1495 for (;block < block_end; block += 2)
1496 {
1497 __m128i m1A = _mm_load_si128(block);
1498 __m128i m2A = _mm_load_si128(block+1);
1499
1500 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1501 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1502
1503 co2 = _mm_extract_epi32(m1CO, 3);
1504
1505 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1506 m2A = _mm_slli_epi32(m2A, 1);
1507
1508 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1509 m2COshft = _mm_slli_si128 (m2CO, 4);
1510 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1511 m2COshft = _mm_insert_epi32 (m2COshft, co2, 0);
1512
1513 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1514 m2A = _mm_or_si128(m2A, m2COshft);
1515
1516 co1 = _mm_extract_epi32(m2CO, 3);
1517
1518 _mm_store_si128(block, m1A);
1519 _mm_store_si128(block+1, m2A);
1520
1521 mAcc = _mm_or_si128(mAcc, m1A);
1522 mAcc = _mm_or_si128(mAcc, m2A);
1523 }
1524 *empty_acc = !_mm_testz_si128(mAcc, mAcc);
1525 return co1;
1526}
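A scalar sketch of the same carry-propagating shift (illustrative only): every bit moves one position up, co1 enters at bit 0 of the first word, and the bit pushed out of the last word is returned as the new carry.

    static bool shift_r1_scalar(bm::word_t* block, unsigned* empty_acc, unsigned co1)
    {
        bm::word_t acc = 0;
        for (unsigned i = 0; i < bm::set_block_size; ++i)
        {
            unsigned co2 = block[i] >> 31;              // outgoing carry
            acc |= block[i] = (block[i] << 1) | co1;    // shift, bring the carry in
            co1 = co2;
        }
        *empty_acc = (acc != 0);
        return co1;                                     // carry out of the block
    }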
1527
1528
1529
1530/*!
1531 @brief block shift right by 1 plus AND
1532
1533 @return carry over flag
1534 @ingroup SSE4
1535*/
1536inline
1537bool sse42_shift_r1_and(__m128i* block,
1538 bm::word_t co1,
1539 const __m128i* BMRESTRICT mask_block,
1540 bm::id64_t* digest)
1541{
1542 bm::word_t* wblock = (bm::word_t*) block;
1543 const bm::word_t* mblock = (const bm::word_t*) mask_block;
1544
1545 __m128i m1COshft, m2COshft;
1546 __m128i mAcc = _mm_set1_epi32(0);
1547 unsigned co2;
1548
1549 bm::id64_t d, wd;
1550 wd = d = *digest;
1551
1552 unsigned di = 0;
1553 if (!co1)
1554 {
1555 bm::id64_t t = d & -d;
1556#ifdef BM64_SSE4
1557 di = unsigned(_mm_popcnt_u64(t - 1)); // find start bit-index
1558#else
1559 bm::id_t t32 = t & bm::id_max;
1560 if (t32 == 0) {
1561 di = 32;
1562 t32 = t >> 32;
1563 }
1564 di += unsigned(_mm_popcnt_u32(t32 - 1));
1565#endif
1566 }
1567
1568 for (; di < 64 ; ++di)
1569 {
1570 const unsigned d_base = di * bm::set_block_digest_wave_size;
1571 bm::id64_t dmask = (1ull << di);
1572 if (d & dmask) // digest stride NOT empty
1573 {
1574 block = (__m128i*) &wblock[d_base];
1575 mask_block = (__m128i*) &mblock[d_base];
1576 mAcc = _mm_xor_si128(mAcc, mAcc); // mAcc = 0
1577 for (unsigned i = 0; i < 4; ++i, block += 2, mask_block += 2)
1578 {
1579 __m128i m1A = _mm_load_si128(block);
1580 __m128i m2A = _mm_load_si128(block+1);
1581
1582 __m128i m1CO = _mm_srli_epi32(m1A, 31);
1583 __m128i m2CO = _mm_srli_epi32(m2A, 31);
1584
1585 co2 = _mm_extract_epi32(m1CO, 3);
1586
1587 m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1588 m2A = _mm_slli_epi32(m2A, 1);
1589
1590 m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift left by 1 int32
1591 m1COshft = _mm_insert_epi32 (m1COshft, co1, 0);
1592
1593 co1 = co2;
1594
1595 co2 = _mm_extract_epi32(m2CO, 3);
1596
1597 m2COshft = _mm_slli_si128 (m2CO, 4);
1598 m2COshft = _mm_insert_epi32 (m2COshft, co1, 0);
1599
1600 m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1601 m2A = _mm_or_si128(m2A, m2COshft);
1602
1603 m1A = _mm_and_si128(m1A, _mm_load_si128(mask_block)); // block[i] &= mask_block[i]
1604 m2A = _mm_and_si128(m2A, _mm_load_si128(mask_block+1)); // block[i] &= mask_block[i]
1605
1606 mAcc = _mm_or_si128(mAcc, m1A);
1607 mAcc = _mm_or_si128(mAcc, m2A);
1608
1609 _mm_store_si128(block, m1A);
1610 _mm_store_si128(block+1, m2A);
1611
1612 co1 = co2;
1613
1614 } // for i
1615
1616 if (_mm_testz_si128(mAcc, mAcc))
1617 d &= ~dmask; // clear digest bit
1618 wd &= wd - 1;
1619 }
1620 else
1621 {
1622 if (co1)
1623 {
1624 BM_ASSERT(co1 == 1);
1625 BM_ASSERT(wblock[d_base] == 0);
1626
1627 bm::id64_t w0 = wblock[d_base] = co1 & mblock[d_base];
1628 d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
1629 co1 = 0;
1630 }
1631 if (!wd) // digest is empty, no CO -> exit
1632 break;
1633 }
1634 } // for di
1635
1636 *digest = d;
1637 return co1;
1638}
1639
1640/**
1641 Build partial XOR product of 2 bit-blocks using digest mask
1642
1643 @param target_block - target := block ^ xor_block
1644 @param block - arg1
1645 @param xor_block - arg2
1646 @param digest - mask for each block wave to XOR (1) or just copy (0)
1647
1648 @ingroup SSE4
1649 @internal
1650*/
1651inline
1652void sse42_bit_block_xor(bm::word_t* target_block,
1653 const bm::word_t* block, const bm::word_t* xor_block,
1654 bm::id64_t digest)
1655{
1656 for (unsigned i = 0; i < bm::block_waves; ++i)
1657 {
1658 const bm::id64_t mask = (1ull << i);
1659 unsigned off = (i * bm::set_block_digest_wave_size);
1660 const __m128i* sub_block = (__m128i*) (block + off);
1661 __m128i* t_sub_block = (__m128i*)(target_block + off);
1662
1663 if (digest & mask) // XOR filtered sub-block
1664 {
1665 const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
1666 __m128i mA, mB, mC, mD;
1667 mA = _mm_xor_si128(_mm_load_si128(sub_block),
1668 _mm_load_si128(xor_sub_block));
1669 mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
1670 _mm_load_si128(xor_sub_block+1));
1671 mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
1672 _mm_load_si128(xor_sub_block+2));
1673 mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
1674 _mm_load_si128(xor_sub_block+3));
1675
1676 _mm_store_si128(t_sub_block, mA);
1677 _mm_store_si128(t_sub_block+1, mB);
1678 _mm_store_si128(t_sub_block+2, mC);
1679 _mm_store_si128(t_sub_block+3, mD);
1680
1681 mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
1682 _mm_load_si128(xor_sub_block+4));
1683 mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
1684 _mm_load_si128(xor_sub_block+5));
1685 mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
1686 _mm_load_si128(xor_sub_block+6));
1687 mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
1688 _mm_load_si128(xor_sub_block+7));
1689
1690 _mm_store_si128(t_sub_block+4, mA);
1691 _mm_store_si128(t_sub_block+5, mB);
1692 _mm_store_si128(t_sub_block+6, mC);
1693 _mm_store_si128(t_sub_block+7, mD);
1694
1695 }
1696 else // just copy source
1697 {
1698 _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
1699 _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
1700 _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
1701 _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
1702
1703 _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
1704 _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
1705 _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
1706 _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
1707 }
1708 } // for i
1709}
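A scalar sketch of the digest-driven XOR/copy above (illustrative): each of the bm::block_waves waves is either XORed with the filter block or copied through, depending on its digest bit.

    static void bit_block_xor_scalar(bm::word_t* target_block,
                                     const bm::word_t* block,
                                     const bm::word_t* xor_block,
                                     bm::id64_t digest)
    {
        for (unsigned i = 0; i < bm::block_waves; ++i)
        {
            unsigned off = i * bm::set_block_digest_wave_size;
            for (unsigned j = 0; j < bm::set_block_digest_wave_size; ++j)
                target_block[off + j] = (digest & (1ull << i))
                               ? (block[off + j] ^ xor_block[off + j])
                               :  block[off + j];
        }
    }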
1710
1711
1712
1713#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1714 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1715
1716#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1717 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1718
1719#define VECT_BITCOUNT(first, last) \
1720 sse4_bit_count((__m128i*) (first), (__m128i*) (last))
1721
1722#define VECT_BITCOUNT_AND(first, last, mask) \
1723 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1724
1725#define VECT_BITCOUNT_OR(first, last, mask) \
1726 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1727
1728#define VECT_BITCOUNT_XOR(first, last, mask) \
1729 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1730
1731#define VECT_BITCOUNT_SUB(first, last, mask) \
1732 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1733
1734#define VECT_INVERT_BLOCK(first) \
1735 sse2_invert_block((__m128i*)first);
1736
1737#define VECT_AND_BLOCK(dst, src) \
1738 sse4_and_block((__m128i*) dst, (__m128i*) (src))
1739
1740#define VECT_AND_DIGEST(dst, src) \
1741 sse4_and_digest((__m128i*) dst, (const __m128i*) (src))
1742
1743#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1744 sse4_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1745
1746#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1747 sse4_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1748
1749#define VECT_OR_BLOCK(dst, src) \
1750 sse2_or_block((__m128i*) dst, (__m128i*) (src))
1751
1752#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1753 sse2_or_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1754
1755#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1756 sse2_or_block_3way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1757
1758#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1759 sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1760
1761#define VECT_SUB_BLOCK(dst, src) \
1762 sse2_sub_block((__m128i*) dst, (const __m128i*) (src))
1763
1764#define VECT_SUB_DIGEST(dst, src) \
1765 sse4_sub_digest((__m128i*) dst, (const __m128i*) (src))
1766
1767#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1768 sse4_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1769
1770#define VECT_XOR_BLOCK(dst, src) \
1771 sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1772
1773#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1774 sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1775
1776#define VECT_COPY_BLOCK(dst, src) \
1777 sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1778
1779#define VECT_STREAM_BLOCK(dst, src) \
1780 sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1781
1782#define VECT_SET_BLOCK(dst, value) \
1783 sse2_set_block((__m128i*) dst, value)
1784
1785#define VECT_IS_ZERO_BLOCK(dst) \
1786 sse4_is_all_zero((__m128i*) dst)
1787
1788#define VECT_IS_ONE_BLOCK(dst) \
1789 sse4_is_all_one((__m128i*) dst)
1790
1791#define VECT_IS_DIGEST_ZERO(start) \
1792 sse4_is_digest_zero((__m128i*)start)
1793
1794#define VECT_BLOCK_SET_DIGEST(dst, val) \
1795 sse4_block_set_digest((__m128i*)dst, val)
1796
1797#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1798 sse4_lower_bound_scan_u32(arr, target, from, to)
1799
1800#define VECT_SHIFT_L1(b, acc, co) \
1801 sse42_shift_l1((__m128i*)b, acc, co)
1802
1803#define VECT_SHIFT_R1(b, acc, co) \
1804 sse42_shift_r1((__m128i*)b, acc, co)
1805
1806#define VECT_SHIFT_R1_AND(b, co, m, digest) \
1807 sse42_shift_r1_and((__m128i*)b, co, (__m128i*)m, digest)
1808
1809#define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
1810 sse42_idx_arr_block_lookup(idx, size, nb, start)
1811
1812#define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
1813 sse42_set_block_bits(block, idx, start, stop)
1814
1815#define VECT_BLOCK_CHANGE(block, size) \
1816 sse42_bit_block_calc_change((__m128i*)block, size)
1817
1818#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size) \
1819 sse42_bit_block_calc_xor_change((__m128i*)block, (__m128i*)xor_block, size)
1820
1821#ifdef BM64_SSE4
1822#define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
1823 sse42_bit_block_calc_change_bc((__m128i*)block, gc, bc)
1824#endif
1825
1826#define VECT_BIT_FIND_FIRST(src, pos) \
1827 sse42_bit_find_first((__m128i*) src, pos)
1828
1829#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1830 sse42_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1831
1832#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1833 sse42_bit_block_xor(t, src, src_xor, d)
1834
1835#define VECT_GAP_BFIND(buf, pos, is_set) \
1836 sse42_gap_bfind(buf, pos, is_set)
1837
1838#ifdef __GNUG__
1839#pragma GCC diagnostic pop
1840#endif
1841
1842#ifdef _MSC_VER
1843#pragma warning( pop )
1844#endif
1845
1846} // namespace
1847
1848
1849
1850
1851#endif
Definitions(internal)
#define BM_ALIGN16
Definition bmdef.h:276
#define BMRESTRICT
Definition bmdef.h:193
#define BM_ALIGN32
Definition bmdef.h:281
#define BM_ALIGN16ATTR
Definition bmdef.h:277
#define BMFORCEINLINE
Definition bmdef.h:203
#define BM_ASSERT
Definition bmdef.h:130
#define BM_ALIGN32ATTR
Definition bmdef.h:282
Compute functions for SSE SIMD instruction set (internal)
Bit manipulation primitives (internal)
BMFORCEINLINE bool sse42_test_all_one_wave(const void *ptr)
check if SSE wave is all 0xFFFF...FFF
Definition bmsse4.h:584
unsigned sse4_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND blocks, *dst &= *src.
Definition bmsse4.h:237
bool sse4_is_all_one(const __m128i *BMRESTRICT block)
check if block is all one bits
Definition bmsse4.h:559
unsigned sse42_bit_block_calc_xor_change(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT xor_block, unsigned size)
Definition bmsse4.h:711
unsigned sse42_bit_block_calc_change(const __m128i *BMRESTRICT block, unsigned size)
Definition bmsse4.h:633
bm::id_t sse4_bit_count(const __m128i *block, const __m128i *block_end)
Definition bmsse4.h:78
BMFORCEINLINE bool sse42_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
Definition bmsse4.h:606
bool sse4_and_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
AND block digest stride, *dst &= *src.
Definition bmsse4.h:284
unsigned sse4_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in ascending order sorted array
Definition bmsse4.h:1154
unsigned sse42_gap_test(const unsigned short *BMRESTRICT buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
Definition bmsse4.h:1106
bool sse42_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift left by 1
Definition bmsse4.h:1437
unsigned sse42_gap_bfind(const unsigned short *BMRESTRICT buf, unsigned pos, unsigned *BMRESTRICT is_set)
Hybrid binary search, starts as binary, then switches to linear scan.
Definition bmsse4.h:1058
bool sse4_is_digest_zero(const __m128i *BMRESTRICT block)
check if digest stride is all zero bits
Definition bmsse4.h:200
BMFORCEINLINE bool sse42_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
Definition bmsse4.h:595
int sse42_cmpge_u32(__m128i vect4, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in sorted, growing array.
Definition bmsse4.h:1123
bool sse4_and_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
AND block digest stride, *dst = *src1 & *src2.
Definition bmsse4.h:332
bool sse42_shift_r1_and(__m128i *block, bm::word_t co1, const __m128i *BMRESTRICT mask_block, bm::id64_t *digest)
block shift right by 1 plus AND
Definition bmsse4.h:1537
void sse42_bit_block_calc_change_bc(const __m128i *BMRESTRICT block, unsigned *gc, unsigned *bc)
Definition bmsse4.h:798
unsigned sse4_gap_find(const bm::gap_word_t *BMRESTRICT pbuf, const bm::gap_word_t pos, const unsigned size)
Definition bmsse4.h:993
bool sse4_sub_digest(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
SUB (AND NOT) block digest stride, *dst &= ~*src.
Definition bmsse4.h:462
void sse4_block_set_digest(__m128i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
Definition bmsse4.h:219
bool sse42_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1)
block shift right by 1
Definition bmsse4.h:1487
BMFORCEINLINE bool sse42_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if wave of 2 pointers are the same (null or FULL)
Definition bmsse4.h:619
bool sse4_and_digest_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4)
AND block digest stride.
Definition bmsse4.h:379
bool sse4_is_all_zero(const __m128i *BMRESTRICT block)
check if block is all zero bits
Definition bmsse4.h:175
void sse42_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
Definition bmsse4.h:1652
bool sse4_sub_digest_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
2-operand SUB (AND NOT) block digest stride, *dst = *src1 & ~*src2
Definition bmsse4.h:511
Definition bm.h:77
const unsigned set_block_digest_wave_size
Definition bmconst.h:66
BMFORCEINLINE unsigned op_or(unsigned a, unsigned b)
Definition bmsse4.h:117
bool sse42_bit_find_first(const __m128i *BMRESTRICT block, unsigned *pos)
Find first non-zero bit.
Definition bmsse4.h:929
const unsigned id_max
Definition bmconst.h:108
unsigned int word_t
Definition bmconst.h:38
bool sse42_bit_find_first_diff(const __m128i *BMRESTRICT block1, const __m128i *BMRESTRICT block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
Definition bmsse4.h:873
void sse42_set_block_bits(bm::word_t *BMRESTRICT block, const unsigned *BMRESTRICT idx, unsigned start, unsigned stop)
Definition bmsse4.h:1259
const unsigned set_block_mask
Definition bmconst.h:56
bm::id_t sse4_bit_count_op(const __m128i *BMRESTRICT block, const __m128i *BMRESTRICT block_end, const __m128i *BMRESTRICT mask_block, Func sse2_func)
Definition bmsse4.h:133
BMFORCEINLINE unsigned op_and(unsigned a, unsigned b)
Definition bmsse4.h:126
BMFORCEINLINE unsigned op_xor(unsigned a, unsigned b)
Definition bmsse4.h:107
const unsigned set_word_shift
Definition bmconst.h:71
const unsigned set_block_size
Definition bmconst.h:54
unsigned long long int id64_t
Definition bmconst.h:34
const unsigned block_waves
Definition bmconst.h:65
unsigned int id_t
Definition bmconst.h:37
unsigned sse42_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
Definition bmsse4.h:1222
unsigned short gap_word_t
Definition bmconst.h:77
void sse4_bit_block_gather_scatter(unsigned *BMRESTRICT arr, const unsigned *BMRESTRICT blk, const unsigned *BMRESTRICT idx, unsigned size, unsigned start, unsigned bit_idx)
Definition bmsse4.h:1343
const unsigned set_block_shift
Definition bmconst.h:55
const unsigned set_word_mask
Definition bmconst.h:72