BitMagic-C++
xsample03.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2018 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example xsample03.cpp
20 Seach for human mutation (SNP) in within chr1.
21 Benchmark comaprison of different methods
22
23 \sa bm::sparse_vector
24 \sa bm::rsc_sparse_vector
25 \sa bm::sparse_vector_scanner
26*/
27
28/*! \file xsample03.cpp
29 \brief Example: SNP search in human genome
30
31 Brief description of used method:
32 1. Parse SNP chromosome report and extract information about SNP number and
33 location in the chromosome
34 2. Use this information to build bit-transposed sparse_vector<>
35 where vector position matches chromosome position and SNP ids (aka rsid)
36 is kept as a bit-transposed matrix
37 3. Build rank-select compressed sparse vector, dropping all NULL columns
38 (this data format is pretty sparse, since number of SNPs is significantly
39 less than number of chromosome bases (1:5 or less)
40 Use memory report to understand memory footprint for each form of storage
41 4. Run benchmarks searching for 500 randomly selected SNPs using
42 - bm::sparse_vector<>
43 - bm::rsc_sparse_vector<>
44 - std::vector<pair<unsigned, unsigned> >
45
46 This example should be useful for construction of compressed columnar
47 tables with parallel search capabilities.
48
49*/
50
51#include <iostream>
52#include <sstream>
53#include <chrono>
54#include <regex>
55#include <time.h>
56#include <stdio.h>
57
58
59#include <vector>
60#include <chrono>
61#include <map>
62#include <utility>
63
64#include "bm.h"
65#include "bmalgo.h"
66#include "bmserial.h"
67#include "bmrandom.h"
68#include "bmsparsevec.h"
69#include "bmsparsevec_compr.h"
70#include "bmsparsevec_algo.h"
71#include "bmsparsevec_serial.h"
72#include "bmalgo_similarity.h"
73#include "bmsparsevec_util.h"
74
75
76#include "bmdbg.h"
77#include "bmtimer.h"
78
79static
81{
82 std::cerr
83 << "BitMagic SNP Search Sample Utility (c) 2018" << std::endl
84 << "-isnp file-name -- input set file (SNP FASTA) to parse" << std::endl
85 << "-svout spase vector output -- sparse vector name to save" << std::endl
86 << "-rscout rs-compressed spase vector output -- name to save" << std::endl
87 << "-svin sparse vector input -- sparse vector file name to load " << std::endl
88 << "-rscin rs-compressed sparse vector input -- file name to load " << std::endl
89 << "-diag -- run diagnostics" << std::endl
90 << "-timing -- collect timings" << std::endl
91 ;
92}
93
94
95
96
97// Arguments
98//
99std::string sv_out_name;
100std::string rsc_out_name;
101std::string sv_in_name;
102std::string rsc_in_name;
103std::string isnp_name;
104bool is_diag = false;
105bool is_timing = false;
106bool is_bench = false;
107
108static
109int parse_args(int argc, char *argv[])
110{
111 for (int i = 1; i < argc; ++i)
112 {
113 std::string arg = argv[i];
114 if ((arg == "-h") || (arg == "--help"))
115 {
116 show_help();
117 return 0;
118 }
119
120 if (arg == "-svout" || arg == "--svout")
121 {
122 if (i + 1 < argc)
123 {
124 sv_out_name = argv[++i];
125 }
126 else
127 {
128 std::cerr << "Error: -svout requires file name" << std::endl;
129 return 1;
130 }
131 continue;
132 }
133 if (arg == "-rscout" || arg == "--rscout")
134 {
135 if (i + 1 < argc)
136 {
137 rsc_out_name = argv[++i];
138 }
139 else
140 {
141 std::cerr << "Error: -rscout requires file name" << std::endl;
142 return 1;
143 }
144 continue;
145 }
146
147 if (arg == "-svin" || arg == "--svin")
148 {
149 if (i + 1 < argc)
150 {
151 sv_in_name = argv[++i];
152 }
153 else
154 {
155 std::cerr << "Error: -svin requires file name" << std::endl;
156 return 1;
157 }
158 continue;
159 }
160
161 if (arg == "-rscin" || arg == "--rscin")
162 {
163 if (i + 1 < argc)
164 {
165 rsc_in_name = argv[++i];
166 }
167 else
168 {
169 std::cerr << "Error: -rscin requires file name" << std::endl;
170 return 1;
171 }
172 continue;
173 }
174
175 if (arg == "-isnp" || arg == "--isnp" || arg == "-snp" || arg == "--snp")
176 {
177 if (i + 1 < argc)
178 {
179 isnp_name = argv[++i];
180 }
181 else
182 {
183 std::cerr << "Error: -isnp requires file name" << std::endl;
184 return 1;
185 }
186 continue;
187 }
188
189 if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
190 is_diag = true;
191 if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
192 is_timing = true;
193 if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
194 is_bench = true;
195
196 } // for i
197 return 0;
198}
199
200
201// Global types
202//
205typedef std::vector<std::pair<unsigned, unsigned> > vector_pairs;
206
207// ----------------------------------------------------------------------------
208
210
211// SNP report format parser (extraction and transformation)
212// Parser extracts SNP id (rsid) and positio on genome to build
213// sparse vector where index (position in vector) metches position on the
214// chromosome 1.
215//
216static
217int load_snp_report(const std::string& fname, sparse_vector_u32& sv)
218{
219 bm::chrono_taker tt1("1. parse input SNP chr report", 1, &timing_map);
220
221 std::ifstream fin(fname.c_str(), std::ios::in);
222 if (!fin.good())
223 return -1;
224
225 unsigned rs_id, rs_pos;
226 size_t idx;
227
228 std::string line;
229 std::string delim = " \t";
230
231 std::regex reg("\\s+");
232 std::sregex_token_iterator it_end;
233
234 bm::bvector<> bv_rs;
235 bv_rs.init();
236
237 unsigned rs_cnt = 0;
238 for (unsigned i = 0; std::getline(fin, line); ++i)
239 {
240 if (line.empty() ||
241 !isdigit(line.front())
242 )
243 continue;
244
245 // regex based tokenizer
246 std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
247 std::vector<std::string> line_vec(it, it_end);
248 if (line_vec.empty())
249 continue;
250
251 // parse columns of interest
252 try
253 {
254 rs_id = unsigned(std::stoul(line_vec.at(0), &idx));
255
256 if (bv_rs.test(rs_id))
257 {
258 continue;
259 }
260 rs_pos = unsigned(std::stoul(line_vec.at(11), &idx));
261
262 bv_rs.set_bit_no_check(rs_id);
263 sv.set(rs_pos, rs_id);
264
265 ++rs_cnt;
266 }
267 catch (std::exception& /*ex*/)
268 {
269 continue; // detailed disgnostics commented out
270 // error detected, because some columns are sometimes missing
271 // just ignore it
272 //
273 /*
274 std::cerr << ex.what() << "; ";
275 std::cerr << "rs=" << line_vec.at(0) << " pos=" << line_vec.at(11) << std::endl;
276 continue;
277 */
278 }
279 if (rs_cnt % (4 * 1024) == 0)
280 std::cout << "\r" << rs_cnt << " / " << i; // PROGRESS report
281 } // for i
282
283 std::cout << std::endl;
284 std::cout << "SNP count=" << rs_cnt << std::endl;
285 return 0;
286}
287
288// Generate random subset of random values from a sparse vector
289//
290static
291void generate_random_subset(const sparse_vector_u32& sv, std::vector<unsigned>& vect, unsigned count)
292{
294
295 bm::random_subset<bm::bvector<> > rand_sampler;
296 bm::bvector<> bv_sample;
297 rand_sampler.sample(bv_sample, *bv_null, count);
298
299 bm::bvector<>::enumerator en = bv_sample.first();
300 for (; en.valid(); ++en)
301 {
302 unsigned idx = *en;
303 unsigned v = sv[idx];
304 vect.push_back(v);
305 }
306}
307
308// build std::vector of pairs (position to rs)
309//
310static
312{
315
316 for (; it != it_end; ++it)
317 {
318 if (!it.is_null())
319 {
320 std::pair<unsigned, unsigned> pos2rs = std::make_pair(it.pos(), it.value());
321 vp.push_back(pos2rs);
322 }
323 }
324}
325
326// O(N) -- O(N/2) linear search in vector of pairs (position - rsid)
327//
328static
329bool search_vector_pairs(const vector_pairs& vp, unsigned rs_id, unsigned& pos)
330{
331 for (unsigned i = 0; i < vp.size(); ++i)
332 {
333 if (vp[i].second == rs_id)
334 {
335 pos = vp[i].first;
336 return true;
337 }
338 }
339 return false;
340}
341
342// SNP search benchmark
343// Search for SNPs using different data structures (Bitmagic and STL)
344//
345// An extra step is verification, to make sure all methods are consistent
346//
347static
349{
350 const unsigned rs_sample_count = 2000;
351
352 std::vector<unsigned> rs_vect;
353 generate_random_subset(sv, rs_vect, rs_sample_count);
354 if (rs_vect.empty())
355 {
356 std::cerr << "Benchmark subset empty!" << std::endl;
357 return;
358 }
359
360 // build traditional sparse vector
361 vector_pairs vp;
362 build_vector_pairs(sv, vp);
363
364 // search result bit-vectors
365 //
366 bm::bvector<> bv_found1;
367 bm::bvector<> bv_found2;
368 bm::bvector<> bv_found3;
369
370 bv_found1.init(); bv_found2.init(); bv_found3.init();// pre-initialize vectors
371
372 if (!sv.empty())
373 {
374 bm::chrono_taker tt1("09. rs search (sv)", unsigned(rs_vect.size()), &timing_map);
375
376 // scanner class
377 // it's important to keep scanner class outside the loop to avoid
378 // unnecessary re-allocs and construction costs
379 //
380
382
383 for (unsigned i = 0; i < rs_vect.size(); ++i)
384 {
385 unsigned rs_id = rs_vect[i];
386 unsigned rs_pos;
387 bool found = scanner.find_eq(sv, rs_id, rs_pos);
388
389 if (found)
390 {
391 bv_found1.set_bit_no_check(rs_pos);
392 }
393 else
394 {
395 std::cout << "Error: rs_id = " << rs_id << " not found!" << std::endl;
396 }
397 } // for
398 }
399
400 if (!csv.empty())
401 {
402 bm::chrono_taker tt1("10. rs search (rsc_sv)", unsigned(rs_vect.size()), &timing_map);
403
405
406 for (unsigned i = 0; i < rs_vect.size(); ++i)
407 {
408 unsigned rs_id = rs_vect[i];
409 unsigned rs_pos;
410 bool found = scanner.find_eq(csv, rs_id, rs_pos);
411
412 if (found)
413 {
414 bv_found2.set_bit_no_check(rs_pos);
415 }
416 else
417 {
418 std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
419 }
420 } // for
421 }
422
423 if (vp.size())
424 {
425 bm::chrono_taker tt1("11. rs search (std::vector<>)", unsigned(rs_vect.size()), &timing_map);
426
427 for (unsigned i = 0; i < rs_vect.size(); ++i)
428 {
429 unsigned rs_id = rs_vect[i];
430 unsigned rs_pos;
431 bool found = search_vector_pairs(vp, rs_id, rs_pos);
432
433 if (found)
434 {
435 bv_found3.set_bit_no_check(rs_pos);
436 }
437 else
438 {
439 std::cout << "rs_id = " << rs_id << " not found!" << std::endl;
440 }
441 } // for
442 }
443
444 // compare results from various methods (check integrity)
445 int res = bv_found1.compare(bv_found2);
446 if (res != 0)
447 {
448 std::cerr << "Error: search discrepancy (sparse search) detected!" << std::endl;
449 }
450 res = bv_found1.compare(bv_found3);
451 if (res != 0)
452 {
453 std::cerr << "Error: search discrepancy (std::vector<>) detected!" << std::endl;
454 }
455
456}
457
458
459int main(int argc, char *argv[])
460{
461 if (argc < 3)
462 {
463 show_help();
464 return 1;
465 }
466
469
470 try
471 {
472 auto ret = parse_args(argc, argv);
473 if (ret != 0)
474 return ret;
475
476 if (!isnp_name.empty())
477 {
478 auto res = load_snp_report(isnp_name, sv);
479 if (res != 0)
480 {
481 return res;
482 }
483 }
484 if (!sv_in_name.empty())
485 {
486 bm::chrono_taker tt1("02. Load sparse vector", 1, &timing_map);
487 file_load_svector(sv, sv_in_name);
488 }
489
490 // load rs-compressed sparse vector
491 if (!rsc_in_name.empty())
492 {
493 bm::chrono_taker tt1("03. Load rsc sparse vector", 1, &timing_map);
494 file_load_svector(csv, rsc_in_name);
495 }
496
497 // if rs-compressed vector is not available - build it on the fly
498 if (csv.empty() && !sv.empty())
499 {
501 {
502 bm::chrono_taker tt1("04. rs compress sparse vector", 1, &timing_map);
503 csv.load_from(sv);
504 }
505 {
506 bm::chrono_taker tt1("05. rs de-compress sparse vector", 1, &timing_map);
507 csv.load_to(sv2);
508 }
509
510 if (!sv.equal(sv2)) // diagnostics check (just in case)
511 {
512 std::cerr << "Error: rs-compressed vector check failed!" << std::endl;
513 return 1;
514 }
515 }
516
517 // save SV vector
518 if (!sv_out_name.empty() && !sv.empty())
519 {
520 bm::chrono_taker tt1("07. Save sparse vector", 1, &timing_map);
521 sv.optimize();
522 file_save_svector(sv, sv_out_name);
523 }
524
525 // save RS sparse vector
526 if (!rsc_out_name.empty() && !csv.empty())
527 {
528 bm::chrono_taker tt1("08. Save RS sparse vector", 1, &timing_map);
529 csv.optimize();
530 file_save_svector(csv, rsc_out_name);
531 }
532
533 if (is_diag) // print memory diagnostics
534 {
535 if (!sv.empty())
536 {
537 std::cout << std::endl
538 << "sparse vector statistics:"
539 << std::endl;
540 bm::print_svector_stat(sv, true);
541 }
542 if (!csv.empty())
543 {
544 std::cout << std::endl
545 << "RS compressed sparse vector statistics:"
546 << std::endl;
547 bm::print_svector_stat(csv, true);
548 }
549 }
550
551 if (is_bench) // run set of benchmarks
552 {
553 run_benchmark(sv, csv);
554 }
555
556 if (is_timing) // print all collected timings
557 {
558 std::cout << std::endl << "Performance:" << std::endl;
560 }
561 }
562 catch (std::exception& ex)
563 {
564 std::cerr << "Error:" << ex.what() << std::endl;
565 return 1;
566 }
567
568 return 0;
569}
570
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
Algorithms for bvector<> (main include)
Generation of random subset.
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Sparse constainer sparse_vector<> for integer types using bit-transposition transform.
Algorithms for bm::sparse_vector.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
Timing utilities for benchmarking (internal)
const bvector_type * get_null_bvector() const BMNOEXCEPT
Get bit-vector of assigned values or NULL (if not constructed that way)
Definition bmbmatrix.h:329
Constant iterator designed to enumerate "ON" bits.
Definition bm.h:600
bool valid() const BMNOEXCEPT
Checks if iterator is still valid.
Definition bm.h:280
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:108
bool test(size_type n) const BMNOEXCEPT
returns true if bit n is set and false is bit n is 0.
Definition bm.h:1440
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
Definition bm.h:1770
void init()
Explicit post-construction initialization.
Definition bm.h:2123
int compare(const bvector< Alloc > &bvect) const BMNOEXCEPT
Lexicographical comparison with a bitvector.
Definition bm.h:3159
void set_bit_no_check(size_type n)
Set bit without checking preconditions (size, etc)
Definition bm.h:3468
Utility class to collect performance measurements and statistics.
Definition bmtimer.h:40
static void print_duration_map(const duration_map_type &dmap, format fmt=ct_time)
Definition bmtimer.h:127
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition bmtimer.h:65
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition bmrandom.h:140
Rank-Select compressed sparse vector.
void load_from(const sparse_vector_type &sv_src)
Load compressed vector from a sparse vector (with NULLs)
void load_to(sparse_vector_type &sv) const
Exort compressed vector to a sparse vector (with NULLs)
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, statistics *stat=0)
run memory optimization for all vector plains
bool empty() const
return true if vector is empty
algorithms for sparse_vector scan/search
void find_eq(const SV &sv, typename SV::value_type value, typename SV::bvector_type &bv_out)
find all sparse vector elements EQ to search value
sparse vector with runtime compression using bit transposition method
Definition bmsparsevec.h:82
bool equal(const sparse_vector< Val, BV > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
void push_back(value_type v)
push value back into vector
void set(size_type idx, value_type v)
set specified element with bounds checking and automatic resize
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end
bool empty() const BMNOEXCEPT
return true if vector is empty
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector plains
@ use_null
support "non-assigned" or "NULL" logic
Definition bmconst.h:215
bm::chrono_taker::duration_map_type timing_map
Definition sample11.cpp:44
int main(void)
Definition sample1.cpp:38
bm::sparse_vector< unsigned, bm::bvector<> > sparse_vector_u32
Definition xsample02.cpp:66
static int parse_args(int argc, char *argv[])
std::vector< std::pair< unsigned, unsigned > > vector_pairs
bool is_bench
std::string sv_in_name
std::string rsc_in_name
static void build_vector_pairs(const sparse_vector_u32 &sv, vector_pairs &vp)
static bool search_vector_pairs(const vector_pairs &vp, unsigned rs_id, unsigned &pos)
static void show_help()
Definition xsample03.cpp:80
std::string isnp_name
bool is_diag
bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > rsc_sparse_vector_u32
static void generate_random_subset(const sparse_vector_u32 &sv, std::vector< unsigned > &vect, unsigned count)
std::string sv_out_name
Definition xsample03.cpp:99
std::string rsc_out_name
static int load_snp_report(const std::string &fname, sparse_vector_u32 &sv)
static void run_benchmark(const sparse_vector_u32 &sv, const rsc_sparse_vector_u32 &csv)
bool is_timing