libStatGen Software  1
GenomeSequence.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _GENOME_SEQUENCE_H
19 #define _GENOME_SEQUENCE_H
20 
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #if !defined(MD5_DIGEST_LENGTH)
24 #define MD5_DIGEST_LENGTH 16
25 #endif
26 #include <string>
27 #include "MemoryMapArray.h"
28 #include "BaseAsciiMap.h"
29 
30 // Goncalo's String class
31 #include "StringArray.h"
32 
33 #include <stdint.h>
34 
35 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was
36 // defined prior to the first include of stdint.h
37 #ifndef UINT32_MAX
38 #define UINT32_MAX 0xFFFFFFFF
39 #endif
40 
41 typedef uint32_t genomeIndex_t;
42 #define INVALID_GENOME_INDEX UINT32_MAX
43 
44 // chromosome index is just a signed int, so this is ok here:
45 #define INVALID_CHROMOSOME_INDEX -1
46 
47 #include "GenomeSequenceHelpers.h"
48 
49 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
50 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
51 
52 typedef MemoryMapArray<
53 uint32_t,
54 genomeIndex_t,
55 UMFA_COOKIE,
56 UMFA_VERSION,
57 PackedAccess_4Bit,
58 PackedAssign_4Bit,
59 Packed4BitElementCount2Bytes,
62 
63 
64 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
65 //
66 
67 //! Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
68 /*!
69  GenomeSequence is designed to be a high performance shared access
70  reference object.
71 
72  It is implemented as a MemoryMapArray template object with unsigned 8
73  bit ints, each of which stores two bases. Although 2 bits could be used,
74  most references have more than four symbols (usually at least including
75  'N', indicating an unknown or masked out base).
76 
77  Normal use of this class follows these steps:
78  -# create the reference
79  -# instantiate the GenomeSequence class object
80  -# create the actual file (memory mapped) that is to hold the data
81  -# populate the data using GenomeSequence::set
82  -# use the reference
83  -# use the reference by instantiating a GenomeSequence object
84  -# either use the constructor with the reference filename
85  -# or use GenomeSequence::setReferenceName() followed by ::open
86  -# access the bases via the overloaded array operator []
87  -# check sequence length by using GenomeSequence::getNumberBases()
88  -# accessing chromosomes in the reference
89  -# you typically will need to know about the chromosomes in the sequence
90  -# see methods and docs with prefix 'getChromosome'
91 
92  Sharing is accomplished using the mmap() function via the MemoryMap
93  base class. This allows a potentially large genome reference to be
94  shared among a number of simultaneously executing instances of one or
95  more programs sharing the same reference.
96  */
97 
98 
100 {
101 private:
102  int _debugFlag;
103 
104  std::ostream *_progressStream;
105  bool _colorSpace;
106 
107  // Whether or not to overwrite an existing file when creating a umfa file (via create).
108  bool _createOverwrite;
109 
110  std::string _baseFilename; // for later use by WordIndex create and open
111  std::string _referenceFilename;
112  std::string _fastaFilename;
113  std::string _umfaFilename;
114  std::string _application; // only used in ::create()
115 
116  MemoryMap _umfaFile;
117 
118  void setup(const char *referenceFilename);
119 
120 public:
121  /// Simple constructor - no implicit file open
122  GenomeSequence();
123  void constructorClear();
124  /// \brief attempt to open an existing sequence
125  ///
126  /// \param referenceFilename the name of the reference fasta file to open
127  /// \param debug if true, additional debug information is printed
128  GenomeSequence(std::string &referenceFilename)
129  {
130  constructorClear();
131  setup(referenceFilename.c_str());
132  }
133 
134  /// Smarter constructor - attempt to open an existing sequence
135  ///
136  /// \param referenceFilename the name of the reference fasta file to open
137  /// \param debug if true, additional debug information is printed
138  GenomeSequence(const char *referenceFilename)
139  {
140  constructorClear();
141  setup(referenceFilename);
142  }
143 
144  /// Close the file if open and destroy the object
145  ~GenomeSequence();
146 
147  /// open the reference specified using GenomeSequence::setReferenceName
148  ///
149  /// \param isColorSpace open the color space reference
150  /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
151  /// \return false for success, true otherwise
152  bool open(bool isColorSpace = false, int flags = O_RDONLY);
153 
154  /// open the given file as the genome (no filename munging occurs).
155  ///
156  /// \param filename the name of the file to open
157  /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
158  /// \return false for success, true otherwise
159  bool open(const char *filename, int flags = O_RDONLY)
160  {
161  _umfaFilename = filename;
162  // TODO - should this method be doing something???
163  return false;
164  }
165 
166 private:
167  bool _searchCommonFileSuffix;
168 public:
169  bool create(bool isColor = false);
170 
171  // NEW API?
172 
173  // load time modifiers:
174  /// if set, then show progress when creating and pre-fetching
175  void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
176  void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
177  void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
178 
179  // Set whether or not to overwrite a umfa file when calling create.
180  void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
181 
182  bool loadFastaData(const char *filename);
183 
184  /// set the reference name that will be used in open()
185  /// \param referenceFilename the name of the reference fasta file to open
186  /// \return false for success, true otherwise
187  ///
188  /// \sa open()
189  bool setReferenceName(std::string referenceFilename);
190 
191  /// set the application name in the binary file header
192  ///
193  /// \param application name of the application
194  void setApplication(std::string application)
195  {
196  _application = application; // used in ::create() to set application name
197  }
198  const std::string &getFastaName() const
199  {
200  return _fastaFilename;
201  }
202  const std::string &getReferenceName() const
203  {
204  return _referenceFilename;
205  }
206 
207  /// tell us if we are a color space reference or not
208  /// \return true if colorspace, false otherwise
209  bool isColorSpace() const
210  {
211  return _colorSpace;
212  }
213 
214  /// return the number of bases represented in this reference
215  /// \return count of bases
216  genomeIndex_t getNumberBases() const
217  {
218  return getElementCount();
219  }
220 
221  /// given a whole genome index, get the chromosome it is located in
222  ///
223  /// This is done via a binary search of the chromosome table in the
224  /// header of the mapped file, so it is O(log(N))
225  ///
226  /// \param 0-based position the base in the genome
227  /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
228  int getChromosome(genomeIndex_t position) const;
229 
230  /// given a chromosome name, return the chromosome index
231  ///
232  /// This is done via a linear search of the chromosome table in the
233  /// header of the mapped file, so it is O(N)
234  ///
235  /// \param chromosomeName the name of the chromosome - exact match only
236  /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
237  int getChromosome(const char *chromosomeName) const;
238  /// Return the number of chromosomes in the genome
239  /// \return number of chromosomes in the genome
240  int getChromosomeCount() const;
241 
242  /// given a chromosome, return the genome base it starts in
243  ///
244  /// \param 0-based chromosome index
245  /// \return 0-based genome index of the base that starts the chromosome
246  genomeIndex_t getChromosomeStart(int chromosomeIndex) const
247  {
248  if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
249  return header->_chromosomes[chromosomeIndex].start;
250  }
251 
252  /// given a chromosome, return its size in bases
253  ///
254  /// \param 0-based chromosome index
255  /// \return size of the chromosome in bases
256  genomeIndex_t getChromosomeSize(int chromosomeIndex) const
257  {
258  if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
259  return header->_chromosomes[chromosomeIndex].size;
260  }
261 
262  /// given a chromosome name and position, return the genome position
263  ///
264  /// \param chromosomeName name of the chromosome - exact match only
265  /// \param chromosomeIndex 1-based chromosome position
266  /// \return genome index of the above chromosome position
267  genomeIndex_t getGenomePosition(
268  const char *chromosomeName,
269  unsigned int chromosomeIndex) const;
270 
271  /// given a chromosome index and position, return the genome position
272  ///
273  /// \param chromosome index of the chromosome
274  /// \param chromosomeIndex 1-based chromosome position
275  /// \return genome index of the above chromosome position
276  genomeIndex_t getGenomePosition(
277  int chromosome,
278  unsigned int chromosomeIndex) const;
279 
280  /// given the chromosome name, get the corresponding 0 based genome index
281  /// for the start of that chromosome
282  genomeIndex_t getGenomePosition(const char *chromosomeName) const;
283  genomeIndex_t getGenomePosition(int chromosomeIndex) const;
284 
285  const std::string &getBaseFilename() const
286  {
287  return _baseFilename;
288  }
289 
290  const char *getChromosomeName(int chromosomeIndex) const
291  {
292  return header->_chromosomes[chromosomeIndex].name;
293  }
294 
295  void setDebugFlag(bool d)
296  {
297  _debugFlag = d;
298  }
299 
300  genomeIndex_t sequenceLength() const
301  {
302  return (genomeIndex_t) header->elementCount;
303  }
304 
305  const char *chromosomeName(int chr) const
306  {
307  return header->_chromosomes[chr].name;
308  }
309 
310  void sanityCheck(MemoryMap &fasta) const;
311 
312  // TODO - this will be moved somewhere else and be made a static method.
313  std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
314 
315  bool wordMatch(unsigned int index, std::string &word) const;
316  bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
317 
318  // TODO - this will be moved somewhere else and be made a static method.
319  char BasePair(char c) const
320  {
321  return BaseAsciiMap::base2complement[(int) c];
322  }
323 
324  void dumpSequenceSAMDictionary(std::ostream&) const;
325  void dumpHeaderTSV(std::ostream&) const;
326 
327  ///
328  /// Return the bases in base space or color space for within range index, ot
329  /// @param index the array-like index (0 based).
330  /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
331  /// For color space, index i represents the transition of base at position (i-1) to base at position i
332  ///
333  /// NB: bounds checking here needs to be deprecated - do not assume it
334  /// will exist - the call must clip reads so that this routine is never
335  /// called with a index value larger than the genome.
336  ///
337  /// The reason for this is simply that this routine gets called hundreds
338  /// of billions of time in one run of karma, which will absolutely kill
339  /// performance. Every single instruction here matters a great, great deal.
340  ///
341 
342  //
343  // 3.5% improvement for color space matching:
344  // I guess the compiler isn't inlining 3 functions deep.
345  //
346 
347 #if 0
348  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
349  // The following code does not work even in the base space,
350  // since the memory layout has changed. USE IT WITH CAUTIOUS!
351  //
352  // This block of code is a functional duplicate of the following
353  // code - leave this here for reference and possibly later
354  // performance testing as well as compiler evaluation.
355  inline char operator[](genomeIndex_t index) const
356  {
357  return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]];
358  }
359 #endif
360 
361  inline char operator[](genomeIndex_t index) const
362  {
363  uint8_t val;
364  if (index < getNumberBases())
365  {
366  if ((index&1)==0)
367  {
368  val = ((uint8_t *) data)[index>>1] & 0xf;
369  }
370  else
371  {
372  val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
373  }
374  }
375  else
376  {
378  }
381  return val;
382  }
383 
384  /// given a chromosome name and 1-based position, return the reference base.
385  /// \param chromosomeName name of the chromosome - exact match only
386  /// \param chromosomeIndex 1-based chromosome position
387  /// \return reference base at the above chromosome position
388  inline char getBase(const char *chromosomeName,
389  unsigned int chromosomeIndex) const
390  {
391  genomeIndex_t index =
392  getGenomePosition(chromosomeName, chromosomeIndex);
393  if(index == INVALID_GENOME_INDEX)
394  {
395  // Invalid position, so return 'N'
396  return('N');
397  }
398  return((*this)[index]);
399  }
400 
401 
402  inline uint8_t getInteger(genomeIndex_t index) const
403  {
404  return (*((genomeSequenceArray*) this))[index];
405  }
406 
407  inline void set(genomeIndex_t index, char value)
408  {
409  genomeSequenceArray::set(index,
410  BaseAsciiMap::base2int[(uint8_t) value]);
411  }
412 
413  /// obtain the pointer to the raw data for other access methods
414  ///
415  /// this is a fairly ugly hack to reach into the
416  /// raw genome vector, get the byte that encodes
417  /// two bases, and return it. This is used by
418  /// karma ReadIndexer::getSumQ to compare genome
419  /// matchines by byte (two bases at a time) to speed
420  /// it up.
421  ///
422  uint8_t *getDataPtr(genomeIndex_t index)
423  {
424  return ((uint8_t *) data + index/2);
425  }
426 private:
427  ///
428  /// when creating the genome mapped file, we call this to set
429  /// the MD5 checksum of the chromosome sequence and length so that
430  /// we can write the SAM SQ headers properly.
431  /// NB: operates on the last fully loaded chromosome.
432  bool setChromosomeMD5andLength(uint32_t whichChromosome);
433 
434 public:
435 
436  // TODO - this will be moved somewhere else and be made a static method.
437  // replace read with the reversed one
438  void getReverseRead(std::string &read);
439 
440  // TODO - this will be moved somewhere else and be made a static method.
441  void getReverseRead(String& read);
442 
443  // debug the given read - print nice results
444  int debugPrintReadValidation(
445  std::string &read,
446  std::string &quality,
447  char direction,
448  genomeIndex_t readLocation,
449  int sumQuality,
450  int mismatchCount,
451  bool recurse = true
452  );
453 
454  //
455  // get the sequence from this GenomeSequence using the specified chromosome and 0-based position.
456  // if baseCount < 0, get the reverse complement
457  // that starts at index (but do not reverse the string?)
458  void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const;
459  void getString(String &str, int chromosome, uint32_t index, int baseCount) const;
460  //
461  // get the sequence from this GenomeSequence.
462  // if baseCount < 0, get the reverse complement
463  // that starts at index (but do not reverse the string?)
464  //
465  void getString(std::string &str, genomeIndex_t index, int baseCount) const;
466  void getString(String &str, genomeIndex_t index, int baseCount) const;
467 
468  void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
469 
470  void print30(genomeIndex_t) const;
471 
472  // for debugging, not for speed:
473  genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
474 
475 
476  // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
477  // They are here for alignment and should be moved to the aligner (karma).
478  // OR they should optionally take a CIGAR and use that if specified....
479  // maybe they should be helper methods that are found somewhere else
480 
481  /// Return the mismatch count, disregarding CIGAR strings
482  ///
483  /// \param read is the sequence we're counting mismatches in
484  /// \param location is where in the genmoe we start comparing
485  /// \param exclude is a wildcard character (e.g. '.' or 'N')
486  ///
487  /// \return number of bases that don't match the reference, except those that match exclude
488  int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
489  {
490  int mismatchCount = 0;
491  for (uint32_t i=0; i<read.size(); i++)
492  if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493  return mismatchCount;
494  };
495 
496  /// brute force sumQ - no sanity checking
497  ///
498  /// \param read shotgun sequencer read string
499  /// \param qualities phred quality string of same length
500  /// \param location the alignment location to check sumQ
501  int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
502  {
503  int sumQ = 0;
504  for (uint32_t i=0; i<read.size(); i++)
505  sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
506  return sumQ;
507  };
508  // return a string highlighting mismatch postions with '^' chars:
509  void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
510  void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
511 
512  // END TODO
513 
514  void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
515  void getChromosomeAndIndex(String &, genomeIndex_t) const;
516 
517  /// check a SAM format read, using phred quality scores and
518  /// the CIGAR string to determine if it is correct.
519  ///
520  ///
521  /// \param read the read in base space
522  /// \param qualities the phred encoded qualities (Sanger, not Illumina)
523  /// \param cigar the SAM file CIGAR column
524  /// \param sumQ if >0 on entry, is checked against the computed sumQ
525  /// \param insertions count of insertions found in
526  ///
527  ///
528  bool checkRead(
529  std::string &read,
530  std::string &qualities,
531  std::string &cigar,
532  int &sumQ, // input and output
533  int &gapOpenCount, // output only
534  int &gapExtendCount, // output only
535  int &gapDeleteCount, // output only
536  std::string &result
537  ) const;
538 
539  bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const;
540 
541  /// user friendly dbSNP loader.
542  ///
543  /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
544  ///
545  /// In all cases, dbSNP is returned the same length as this genome.
546  ///
547  /// When no SNPs are loaded, all values are false.
548  ///
549  /// When a text file is given, the file is parsed with two space
550  /// separated columns - the first column is the chromosome name, and
551  /// the second is the 1-based chromosome position of the SNP.
552  ///
553  /// \return false if a dbSNP file was correctly loaded, true otherwise
554  ///
555  bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
556 };
557 
558 #endif
GenomeSequence::setApplication
void setApplication(std::string application)
set the application name in the binary file header
Definition: GenomeSequence.h:194
BaseAsciiMap::baseNIndex
static const int baseNIndex
Value associated with 'N' in the ascii to base map (bad read).
Definition: BaseAsciiMap.h:28
GenomeSequence::GenomeSequence
GenomeSequence(std::string &referenceFilename)
attempt to open an existing sequence
Definition: GenomeSequence.h:128
GenomeSequence::open
bool open(const char *filename, int flags=O_RDONLY)
open the given file as the genome (no filename munging occurs).
Definition: GenomeSequence.h:159
GenomeSequence::~GenomeSequence
~GenomeSequence()
Close the file if open and destroy the object.
Definition: GenomeSequence.cpp:169
String
Definition: StringBasics.h:38
GenomeSequence::getChromosomeStart
genomeIndex_t getChromosomeStart(int chromosomeIndex) const
given a chromosome, return the genome base it starts in
Definition: GenomeSequence.h:246
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:99
GenomeSequence::GenomeSequence
GenomeSequence()
Simple constructor - no implicit file open.
Definition: GenomeSequence.cpp:139
GenomeSequence::GenomeSequence
GenomeSequence(const char *referenceFilename)
Smarter constructor - attempt to open an existing sequence.
Definition: GenomeSequence.h:138
GenomeSequence::getBase
char getBase(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and 1-based position, return the reference base.
Definition: GenomeSequence.h:388
GenomeSequence::getChromosomeCount
int getChromosomeCount() const
Return the number of chromosomes in the genome.
Definition: GenomeSequence.cpp:731
GenomeSequence::getChromosomeSize
genomeIndex_t getChromosomeSize(int chromosomeIndex) const
given a chromosome, return its size in bases
Definition: GenomeSequence.h:256
BaseAsciiMap::base2int
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
Definition: BaseAsciiMap.h:61
GenomeSequence::checkRead
bool checkRead(std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correc...
GenomeSequence::getChromosome
int getChromosome(genomeIndex_t position) const
given a whole genome index, get the chromosome it is located in
Definition: GenomeSequence.cpp:737
MemoryMap
There are a pair of related data structures in the operating system, and also a few simple algorithms...
Definition: MemoryMap.h:155
GenomeSequence::getMismatchCount
int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
Return the mismatch count, disregarding CIGAR strings.
Definition: GenomeSequence.h:488
MemoryMapArray
Definition: MemoryMapArray.h:141
genomeSequenceMmapHeader
Definition: GenomeSequenceHelpers.h:79
GenomeSequence::getDataPtr
uint8_t * getDataPtr(genomeIndex_t index)
obtain the pointer to the raw data for other access methods
Definition: GenomeSequence.h:422
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:36
BaseAsciiMap::int2colorSpace
static const char int2colorSpace[]
Convert from int representation to colorspace representation.
Definition: BaseAsciiMap.h:40
GenomeSequence::isColorSpace
bool isColorSpace() const
tell us if we are a color space reference or not
Definition: GenomeSequence.h:209
GenomeSequence::setReferenceName
bool setReferenceName(std::string referenceFilename)
set the reference name that will be used in open()
Definition: GenomeSequence.cpp:254
GenomeSequence::operator[]
char operator[](genomeIndex_t index) const
Return the bases in base space or color space for within range index, ot.
Definition: GenomeSequence.h:361
GenomeSequence::getSumQ
int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
brute force sumQ - no sanity checking
Definition: GenomeSequence.h:501
GenomeSequence::setProgressStream
void setProgressStream(std::ostream &progressStream)
if set, then show progress when creating and pre-fetching
Definition: GenomeSequence.h:175
GenomeSequence::open
bool open(bool isColorSpace=false, int flags=O_RDONLY)
open the reference specified using GenomeSequence::setReferenceName
Definition: GenomeSequence.cpp:182
GenomeSequence::loadDBSNP
bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const
user friendly dbSNP loader.
Definition: GenomeSequence.cpp:1301
BaseAsciiMap::int2base
static const char int2base[]
Convert from int representation to the base.
Definition: BaseAsciiMap.h:38
BaseAsciiMap::base2complement
static unsigned char base2complement[]
This table maps 5' base space to the 3' complement base space values, as well as 5' color space value...
Definition: BaseAsciiMap.h:41
GenomeSequence::getGenomePosition
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Definition: GenomeSequence.cpp:779
GenomeSequence::getNumberBases
genomeIndex_t getNumberBases() const
return the number of bases represented in this reference
Definition: GenomeSequence.h:216