libStatGen Software  1
BamIndex.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "BamIndex.h"
19 #include <iomanip>
20 
21 BamIndex::BamIndex()
22  : IndexBase(),
23  maxOverallOffset(0),
24  myUnMappedNumReads(-1)
25 {
26 }
27 
28 
29 BamIndex::~BamIndex()
30 {
31 }
32 
33 
34 // Reset the member data for a new index file.
36 {
38 
39  maxOverallOffset = 0;
40  myUnMappedNumReads = -1;
41 }
42 
43 
44 // Read & parse the specified index file.
46 {
47  // Reset the index from anything that may previously be set.
48  resetIndex();
49 
50  IFILE indexFile = ifopen(filename, "rb");
51 
52  // Failed to open the index file.
53  if(indexFile == NULL)
54  {
55  return(SamStatus::FAIL_IO);
56  }
57 
58  // generate the bam index structure.
59 
60  // Read the magic string.
61  char magic[4];
62  if(ifread(indexFile, magic, 4) != 4)
63  {
64  // Failed to read the magic
65  ifclose(indexFile);
66  return(SamStatus::FAIL_IO);
67  }
68 
69  // If this is not an index file, set num references to 0.
70  if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
71  {
72  // Not a BAM Index file.
73  ifclose(indexFile);
74  return(SamStatus::FAIL_PARSE);
75  }
76 
77  // It is a bam index file.
78  // Read the number of reference sequences.
79  if(ifread(indexFile, &n_ref, 4) != 4)
80  {
81  // Failed to read.
82  ifclose(indexFile);
83  return(SamStatus::FAIL_IO);
84  }
85 
86  // Size the references.
87  myRefs.resize(n_ref);
88 
89  for(int refIndex = 0; refIndex < n_ref; refIndex++)
90  {
91  // Read each reference.
92  Reference* ref = &(myRefs[refIndex]);
93 
94  // Read the number of bins.
95  if(ifread(indexFile, &(ref->n_bin), 4) != 4)
96  {
97  // Failed to read the number of bins.
98  // Return failure.
99  ifclose(indexFile);
100  return(SamStatus::FAIL_PARSE);
101  }
102 
103  // If there are no bins, then there are no
104  // mapped/unmapped reads.
105  if(ref->n_bin == 0)
106  {
107  ref->n_mapped = 0;
108  ref->n_unmapped = 0;
109  }
110 
111  // Resize the bins so they can be indexed by bin number.
112  ref->bins.resize(ref->n_bin + 1);
113 
114  // Read each bin.
115  for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
116  {
117  uint32_t binNumber;
118 
119  // Read in the bin number.
120  if(ifread(indexFile, &(binNumber), 4) != 4)
121  {
122  // Failed to read the bin number.
123  // Return failure.
124  ifclose(indexFile);
125  return(SamStatus::FAIL_IO);
126  }
127 
128  // Add the bin to the reference and get the
129  // pointer back so the values can be set in it.
130  Bin* binPtr = &(ref->bins[binIndex]);
131  binPtr->bin = binNumber;
132 
133  // Read in the number of chunks.
134  if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
135  {
136  // Failed to read number of chunks.
137  // Return failure.
138  ifclose(indexFile);
139  return(SamStatus::FAIL_IO);
140  }
141 
142  // Read in the chunks.
143  // Allocate space for the chunks.
144  uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
145  binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
146  if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
147  {
148  // Failed to read the chunks.
149  // Return failure.
150  ifclose(indexFile);
151  return(SamStatus::FAIL_IO);
152  }
153 
154  // Determine the min/max for this bin if it is not the max bin.
155  if(binPtr->bin != MAX_NUM_BINS)
156  {
157  for(int i = 0; i < binPtr->n_chunk; i++)
158  {
159  if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
160  {
161  ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
162  }
163  if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
164  {
165  ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
166  }
167  if(binPtr->chunks[i].chunk_end > maxOverallOffset)
168  {
169  maxOverallOffset = binPtr->chunks[i].chunk_end;
170  }
171  }
172  }
173  else
174  {
175  // Mapped/unmapped are the last chunk of the
176  // MAX BIN
177  ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
178  ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
179  }
180  }
181 
182  // Read the number of intervals.
183  if(ifread(indexFile, &(ref->n_intv), 4) != 4)
184  {
185  // Failed to read, set to 0.
186  ref->n_intv = 0;
187  // Return failure.
188  ifclose(indexFile);
189  return(SamStatus::FAIL_IO);
190  }
191 
192  // Allocate space for the intervals and read them.
193  uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
194  ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
195  if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
196  {
197  // Failed to read the linear index.
198  // Return failure.
199  ifclose(indexFile);
200  return(SamStatus::FAIL_IO);
201  }
202  }
203 
204  int32_t numUnmapped = 0;
205  if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
206  {
207  myUnMappedNumReads = numUnmapped;
208  }
209 
210  // Successfully read the bam index file.
211  ifclose(indexFile);
212  return(SamStatus::SUCCESS);
213 }
214 
215 
216 // Get the chunks for the specified reference id and start/end 0-based
217 // coordinates.
218 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
219  SortedChunkList& chunkList)
220 {
221  chunkList.clear();
222 
223  // If start is >= to end, there will be no sections, return no
224  // regions.
225  if((start >= end) && (end != -1))
226  {
227  std::cerr << "Warning, requesting region where start <= end, so "
228  << "no values will be returned.\n";
229  return(false);
230  }
231 
232  // Handle REF_ID_UNMAPPED. This uses a default chunk which covers
233  // from the max offset to the end of the file.
234  if(refID == REF_ID_UNMAPPED)
235  {
236  Chunk refChunk;
237  // The start of the unmapped region is the max offset found
238  // in the index file.
239  refChunk.chunk_beg = getMaxOffset();
240  // The end of the unmapped region is the end of the file, so
241  // set chunk end to the max value.
242  refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
243  return(chunkList.insert(refChunk));
244  }
245 
246  if((refID < 0) || (refID >= n_ref))
247  {
248  // The specified refID is out of range, return false.
249  std::cerr << "Warning, requesting refID is out of range, so "
250  << "no values will be returned.\n";
251  return(false);
252  }
253 
254  const Reference* ref = &(myRefs[refID]);
255 
256  // Handle where start/end are defaults.
257  if(start == -1)
258  {
259  if(end == -1)
260  {
261  // This is whole chromosome, so take a shortcut.
262  if(ref->maxChunkOffset == 0)
263  {
264  // No chunks for this region, but this is not an error.
265  return(true);
266  }
267  Chunk refChunk;
268  refChunk.chunk_beg = ref->minChunkOffset;
269  refChunk.chunk_end = ref->maxChunkOffset;
270  return(chunkList.insert(refChunk));
271  }
272  else
273  {
274  start = 0;
275  }
276  }
277  if(end == -1)
278  {
279  // MAX_POSITION is inclusive, but end is exclusive, so add 1.
280  end = MAX_POSITION + 1;
281  }
282 
283  // Determine the minimum offset for the given start position. This
284  // is done by using the linear index for the specified start position.
285  uint64_t minOffset = 0;
286  getMinOffsetFromLinearIndex(refID, start, minOffset);
287 
288  bool binInRangeMap[MAX_NUM_BINS+1];
289 
290  getBinsForRegion(start, end, binInRangeMap);
291 
292  // Loop through the bins in the ref and if they are in the region, get the chunks.
293  for(int i = 0; i < ref->n_bin; ++i)
294  {
295  const Bin* bin = &(ref->bins[i]);
296  if(binInRangeMap[bin->bin] == false)
297  {
298  // This bin is not in the region, so check the next one.
299  continue;
300  }
301 
302  // Add each chunk in the bin to the map.
303  for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
304  {
305  // If the end of the chunk is less than the minimum offset
306  // for the 16K block that starts our region, then no
307  // records in this chunk will cross our region, so do
308  // not add it to the chunks we need to use.
309  if(bin->chunks[chunkIndex].chunk_end < minOffset)
310  {
311  continue;
312  }
313  // Add the chunk to the map.
314  if(!chunkList.insert(bin->chunks[chunkIndex]))
315  {
316  // Failed to add to the map, return false.
317  std::cerr << "Warning, Failed to add a chunk, so "
318  << "no values will be returned.\n";
319  return(false);
320  }
321  }
322  }
323 
324  // Now that all chunks have been added to the list,
325  // handle overlapping chunks.
326  return(chunkList.mergeOverlapping());
327 }
328 
329 
330 // Get the max offset.
331 uint64_t BamIndex::getMaxOffset() const
332 {
333  return(maxOverallOffset);
334 }
335 
336 // Get the min & max file offsets for the reference ID.
337 bool BamIndex::getReferenceMinMax(int32_t refID,
338  uint64_t& minOffset,
339  uint64_t& maxOffset) const
340 {
341  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
342  {
343  // Reference ID is out of range for this index file.
344  return(false);
345  }
346 
347  // Get this reference.
348  minOffset = myRefs[refID].minChunkOffset;
349  maxOffset = myRefs[refID].maxChunkOffset;
350  return(true);
351 }
352 
353 
354 // Get the number of mapped reads for this reference id.
355 int32_t BamIndex::getNumMappedReads(int32_t refID)
356 {
357  // If it is the reference id of unmapped reads, return
358  // that there are no mapped reads.
359  if(refID == REF_ID_UNMAPPED)
360  {
361  // These are by definition all unmapped reads.
362  return(0);
363  }
364 
365  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
366  {
367  // Reference ID is out of range for this index file.
368  return(-1);
369  }
370 
371  // Get this reference.
372  return(myRefs[refID].n_mapped);
373 }
374 
375 
376 // Get the number of unmapped reads for this reference id.
377 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
378 {
379  // If it is the reference id of unmapped reads, return
380  // that value.
381  if(refID == REF_ID_UNMAPPED)
382  {
383  return(myUnMappedNumReads);
384  }
385 
386  if((refID < 0) || (refID >= (int32_t)myRefs.size()))
387  {
388  // Reference ID is out of range for this index file.
389  return(-1);
390  }
391 
392  // Get this reference.
393  return(myRefs[refID].n_unmapped);
394 }
395 
396 
397 // Print the bam index.
398 void BamIndex::printIndex(int32_t refID, bool summary)
399 {
400  std::cout << "BAM Index: " << std::endl;
401  std::cout << "# Reference Sequences: " << n_ref << std::endl;
402 
403  unsigned int startRef = 0;
404  unsigned int endRef = myRefs.size() - 1;
405  std::vector<Reference> refsToProcess;
406  if(refID != -1)
407  {
408  // Set start and end ref to the specified reference id.
409  startRef = refID;
410  endRef = refID;
411  }
412 
413  // Print out the information for each bin.
414  for(unsigned int i = startRef; i <= endRef; ++i)
415  {
416  std::cout << std::dec
417  << "\tReference ID: " << std::setw(4) << i
418  << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
419  << "; #Linear Index Entries: "
420  << std::setw(6) << myRefs[i].n_intv
421  << "; Min Chunk Offset: "
422  << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
423  << "; Max Chunk Offset: "
424  << std::setw(18) << myRefs[i].maxChunkOffset
425  << std::dec;
426  // Print the mapped/unmapped if set.
427  if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
428  {
429  std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
430  }
431  if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
432  {
433  std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
434  }
435  std::cout << std::endl;
436 
437  // Only print more details if not summary.
438  if(!summary)
439  {
440  std::vector<Bin>::iterator binIter;
441  for(binIter = myRefs[i].bins.begin();
442  binIter != myRefs[i].bins.end();
443  ++binIter)
444  {
445  Bin* binPtr = &(*binIter);
446  if(binPtr->bin == Bin::NOT_USED_BIN)
447  {
448  // This bin is not used, continue.
449  continue;
450  }
451  // Print the bin info.
452  std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
453  std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
454  std::cout << std::hex << std::showbase;
455 
456  for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
457  ++chunkIndex)
458  {
459  // If this is the last chunk of the MAX_NUM_BINS - it
460  // contains a mapped/unmapped count rather than the regular
461  // chunk addresses.
462  if((binPtr->bin != MAX_NUM_BINS) ||
463  (chunkIndex != (binPtr->n_chunk - 1)))
464  {
465  std::cout << "\t\t\t\tchunk_beg: "
466  << binPtr->chunks[chunkIndex].chunk_beg
467  << std::endl;
468  std::cout << "\t\t\t\tchunk_end: "
469  << binPtr->chunks[chunkIndex].chunk_end
470  << std::endl;
471  }
472  }
473  }
474  std::cout << std::dec;
475 
476  // Print the linear index.
477  for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
478  ++linearIndex)
479  {
480  if(myRefs[i].ioffsets[linearIndex] != 0)
481  {
482  std::cout << "\t\t\tLinearIndex["
483  << std::dec << linearIndex << "] Offset: "
484  << std::hex << myRefs[i].ioffsets[linearIndex]
485  << std::endl;
486  }
487  }
488  }
489  }
490 }
IndexBase::resetIndex
virtual void resetIndex()
Reset the member data for a new index file.
Definition: IndexBase.cpp:130
SortedChunkList
Definition: IndexBase.h:48
IndexBase::Bin
Definition: IndexBase.h:97
BamIndex::getChunksForRegion
bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
Get the list of chunks associated with this region.
Definition: BamIndex.cpp:218
ifopen
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
BamIndex::getNumMappedReads
int32_t getNumMappedReads(int32_t refID)
Get the number of mapped reads for this reference id.
Definition: BamIndex.cpp:355
StatGenStatus::SUCCESS
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
BamIndex::getReferenceMinMax
bool getReferenceMinMax(int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const
Get the minimum and maximum file offsets for the specfied reference ID.
Definition: BamIndex.cpp:337
StatGenStatus::FAIL_PARSE
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42
BamIndex::printIndex
void printIndex(int32_t refID, bool summary=false)
Print the index information.
Definition: BamIndex.cpp:398
BamIndex::getNumUnMappedReads
int32_t getNumUnMappedReads(int32_t refID)
Get the number of unmapped reads for this reference id.
Definition: BamIndex.cpp:377
IndexBase
Definition: IndexBase.h:62
StatGenStatus::Status
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:31
ifread
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:36
StatGenStatus::FAIL_IO
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
Chunk
Definition: IndexBase.h:30
IndexBase::Reference
Definition: IndexBase.h:118
BamIndex::readIndex
SamStatus::Status readIndex(const char *filename)
Definition: BamIndex.cpp:45
ifclose
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
BamIndex::resetIndex
virtual void resetIndex()
Reset the member data for a new index file.
Definition: BamIndex.cpp:35
BamIndex::REF_ID_UNMAPPED
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.
Definition: BamIndex.h:86