libStatGen Software  1
SamQuerySeqWithRefHelper.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdexcept>
19 
20 #include "SamQuerySeqWithRefHelper.h"
21 #include "BaseUtilities.h"
22 #include "SamFlag.h"
23 
24 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
25  GenomeSequence& refSequence,
26  bool forward)
27  : myRecord(record),
28  myRefSequence(refSequence),
29  myCigar(NULL),
30  myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
31  myQueryIndex(0),
32  myForward(forward)
33 {
34  myCigar = myRecord.getCigarInfo();
35  myStartOfReadOnRefIndex =
36  refSequence.getGenomePosition(myRecord.getReferenceName());
37 
38  if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
39  {
40  // This reference name was found in the reference file, so
41  // add the start position.
42  myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
43  }
44 
45  if(!forward)
46  {
47  myQueryIndex = myRecord.getReadLength() - 1;
48  }
49 }
50 
51 
52 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
53 {
54 }
55 
56 
57 
59 {
60  myCigar = myRecord.getCigarInfo();
61  if(myCigar == NULL)
62  {
63  // Failed to get Cigar.
64  return(false);
65  }
66 
67  // Get where the position of where this read starts as mapped to the
68  // reference.
69  myStartOfReadOnRefIndex =
70  myRefSequence.getGenomePosition(myRecord.getReferenceName());
71 
72  if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
73  {
74  // This reference name was found in the reference file, so
75  // add the start position.
76  myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
77  }
78 
79  myForward = forward;
80 
81  if(myForward)
82  {
83  myQueryIndex = 0;
84  }
85  else
86  {
87  // reverse, so start at the last entry.
88  myQueryIndex = myRecord.getReadLength() - 1;
89  }
90  return(true);
91 }
92 
93 
94 // Returns information for the next position where the query and the
95 // reference match or mismatch. To be a match or mismatch, both the query
96 // and reference must have a base that is not 'N'.
97 // This means:
98 // insertions and deletions are not mismatches or matches.
99 // 'N' bases are not matches or mismatches
100 // Returns true if an entry was found, false if there are no more matches or
101 // mismatches.
103 {
104  // Check whether or not this read is mapped.
105  // If the read is not mapped, return no matches.
106  if(!SamFlag::isMapped(myRecord.getFlag()))
107  {
108  // Not mapped.
109  return(false);
110  }
111 
112  // Check that the Cigar is set.
113  if(myCigar == NULL)
114  {
115  // Error.
116  throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
117  return(false);
118  }
119 
120  // If myStartOfReadOnRefIndex is the default (unset) value, then
121  // the reference was not found, so return false, no matches/mismatches.
122  if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
123  {
124  // This reference name was not found in the reference file, so just
125  // return no matches/mismatches.
126  return(false);
127  }
128 
129 
130  // Repull the read length from the record to check just in case the
131  // record has changed length.
132  // Loop until a match or mismatch is found as long as query index
133  // is still within the read (Loop is broken by a return).
134  while((myQueryIndex < myRecord.getReadLength()) &&
135  (myQueryIndex >= 0))
136  {
137  // Still more bases, look for a match/mismatch.
138 
139  // Get the reference offset for this read position.
140  int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
141  if(refOffset == Cigar::INDEX_NA)
142  {
143  // This is either a softclip or an insertion
144  // which do not count as a match or a mismatch, so
145  // go to the next index.
146  nextIndex();
147  continue;
148  }
149 
150  // Both the reference and the read have a base, so get the bases.
151  char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
152  char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
153 
154  // If either the read or the reference bases are unknown, then
155  // it does not count as a match or a mismatch.
156  if(BaseUtilities::isAmbiguous(readBase) ||
158  {
159  // Either the reference base or the read base are unknown,
160  // so skip this position.
161  nextIndex();
162  continue;
163  }
164 
165  // Both the read & the reference have a known base, so it is either
166  // a match or a mismatch.
167  matchMismatchInfo.setQueryIndex(myQueryIndex);
168 
169  // Check if they are equal.
170  if(BaseUtilities::areEqual(readBase, refBase))
171  {
172  // Match.
173  matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
174  // Increment the query index to the next position.
175  nextIndex();
176  return(true);
177  }
178  else
179  {
180  // Mismatch
181  matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
182  // Increment the query index to the next position.
183  nextIndex();
184  return(true);
185  }
186  }
187 
188  // No matches or mismatches were found, so return false.
189  return(false);
190 }
191 
192 
193 void SamQuerySeqWithRefIter::nextIndex()
194 {
195  if(myForward)
196  {
197  ++myQueryIndex;
198  }
199  else
200  {
201  --myQueryIndex;
202  }
203 }
204 
205 
206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
207  : myType(UNKNOWN),
208  myQueryIndex(0)
209 {
210 }
211 
212 
213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
214 {
215 }
216 
217 
219 {
220  return(myType);
221 }
222 
223 
225 {
226  return(myQueryIndex);
227 }
228 
229 
231 {
232  myType = newType;
233 }
234 
235 
237 {
238  myQueryIndex = queryIndex;
239 }
240 
241 
242 ///////////////////////////////////////////////////////////////////////////
243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
244  int32_t seq0BasedPos,
245  Cigar& cigar,
246  const char* referenceName,
247  const GenomeSequence& refSequence,
248  std::string& updatedSeq)
249 {
250  updatedSeq = currentSeq;
251 
252  int32_t seqLength = updatedSeq.length();
253  int32_t queryIndex = 0;
254 
255  uint32_t startOfReadOnRefIndex =
256  refSequence.getGenomePosition(referenceName);
257 
258  if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
259  {
260  // This reference name was not found in the reference file, so just
261  // return.
262  return;
263  }
264  startOfReadOnRefIndex += seq0BasedPos;
265 
266  // Loop until the entire sequence has been updated.
267  while(queryIndex < seqLength)
268  {
269  // Still more bases, look for matches.
270 
271  // Get the reference offset for this read position.
272  int32_t refOffset = cigar.getRefOffset(queryIndex);
273  if(refOffset != Cigar::INDEX_NA)
274  {
275  // Both the reference and the read have a base, so get the bases.
276  char readBase = currentSeq[queryIndex];
277  char refBase = refSequence[startOfReadOnRefIndex + refOffset];
278 
279  // If neither base is unknown and they are the same, count it
280  // as a match.
281  if(!BaseUtilities::isAmbiguous(readBase) &&
282  !BaseUtilities::isAmbiguous(refBase) &&
283  (BaseUtilities::areEqual(readBase, refBase)))
284  {
285  // Match.
286  updatedSeq[queryIndex] = '=';
287  }
288  }
289  // Increment the query index to the next position.
290  ++queryIndex;
291  continue;
292  }
293 }
294 
295 
296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
297  int32_t seq0BasedPos,
298  Cigar& cigar,
299  const char* referenceName,
300  const GenomeSequence& refSequence,
301  std::string& updatedSeq)
302 {
303  updatedSeq = currentSeq;
304 
305  int32_t seqLength = updatedSeq.length();
306  int32_t queryIndex = 0;
307 
308  uint32_t startOfReadOnRefIndex =
309  refSequence.getGenomePosition(referenceName);
310 
311  if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
312  {
313  // This reference name was not found in the reference file, so just
314  // return.
315  return;
316  }
317  startOfReadOnRefIndex += seq0BasedPos;
318 
319  // Loop until the entire sequence has been updated.
320  while(queryIndex < seqLength)
321  {
322  // Still more bases, look for matches.
323 
324  // Get the reference offset for this read position.
325  int32_t refOffset = cigar.getRefOffset(queryIndex);
326  if(refOffset != Cigar::INDEX_NA)
327  {
328  // Both the reference and the read have a base, so get the bases.
329  char readBase = currentSeq[queryIndex];
330  char refBase = refSequence[startOfReadOnRefIndex + refOffset];
331 
332  // If the bases are equal, set the sequence to the reference
333  // base. (Skips the check for ambiguous to catch a case where
334  // ambiguous had been converted to a '=', and if both are ambiguous,
335  // it will still be set to ambiguous.)
336  if(BaseUtilities::areEqual(readBase, refBase))
337  {
338  // Match.
339  updatedSeq[queryIndex] = refBase;
340  }
341  }
342 
343  // Increment the query index to the next position.
344  ++queryIndex;
345  continue;
346  }
347 }
Cigar
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:83
SamRecord::get0BasedPosition
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
SamRecord::getReferenceName
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286
BaseUtilities::isAmbiguous
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
Definition: BaseUtilities.cpp:23
SamSingleBaseMatchInfo::setType
void setType(Type newType)
Set the type (match/mismatch/unkown) for this object.
Definition: SamQuerySeqWithRefHelper.cpp:230
SamQuerySeqWithRefIter::reset
bool reset(bool forward=true)
Reset to start at the beginning of the record.
Definition: SamQuerySeqWithRefHelper.cpp:58
SamSingleBaseMatchInfo::setQueryIndex
void setQueryIndex(int32_t queryIndex)
Set the query index for this object.
Definition: SamQuerySeqWithRefHelper.cpp:236
SamQuerySeqWithRefIter::getNextMatchMismatch
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch.
Definition: SamQuerySeqWithRefHelper.cpp:102
GenomeSequence
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Definition: GenomeSequence.h:99
SamSingleBaseMatchInfo
This class contains the match/mismatch information between the reference and a read for a single base...
Definition: SamQuerySeqWithRefHelper.h:28
Cigar::INDEX_NA
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
SamRecord::getFlag
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1372
SamSingleBaseMatchInfo::getQueryIndex
int32_t getQueryIndex()
Get the query index for this object.
Definition: SamQuerySeqWithRefHelper.cpp:224
SamSingleBaseMatchInfo::getType
Type getType()
Get the type (match/mismatch/unknown) for this object.
Definition: SamQuerySeqWithRefHelper.cpp:218
SamRecord::getSequence
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1556
SamRecord::getReadLength
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1379
BaseUtilities::areEqual
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is '=',...
Definition: BaseUtilities.cpp:39
SamRecord
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:51
SamRecord::getCigarInfo
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
Cigar::getRefOffset
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition: Cigar.cpp:187
SamRecord::NONE
@ NONE
Leave the sequence as is.
Definition: SamRecord.h:58
SamSingleBaseMatchInfo::Type
Type
More types can be added later as needed.
Definition: SamQuerySeqWithRefHelper.h:32
SamQuerySeqWithRef::seqWithEquals
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with '=' in any position where the sequence matches the reference.
Definition: SamQuerySeqWithRefHelper.cpp:243
SamQuerySeqWithRef::seqWithoutEquals
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting '=' to the appropriate base using the reference.
Definition: SamQuerySeqWithRefHelper.cpp:296
GenomeSequence::getGenomePosition
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Definition: GenomeSequence.cpp:779