libStatGen Software 1
Loading...
Searching...
No Matches
ReferenceSequence.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef __REFERENCESEQUENCE_H
19#define __REFERENCESEQUENCE_H
20
21#include "BaseAsciiMap.h"
22#include "Generic.h"
23#include "PackedVector.h"
24
25#include <algorithm>
26#include <iostream>
27
28//
29// The namespace Sequence is for templated algorithms that are by and large
30// independent of the underlying storage mechanism for the bases.
31//
32// They are written in such a way as to assume that the array operator []
33// will return an ASCII representation of a base space nucleotide.
34//
35// In theory, this set of templates will work with a variety of combinations
36// of means for representing bases - String, std::string, and others.
37//
38// The containers are expected to allow for an overidden [] operator,
39// and provide a size() method to return the number of bases in the
40// container.
41//
42// In containers where sequence data is placed, they must in addition
43// have a clear() method as well as a push_back() method as done
44// in std::string containers.
45//
46namespace Sequence {
47
48//
49// wordMatch(Sequnece, Index, Word) - compare a word to a sequence of bases
50// Sequence is a generic container with a large set of bases
51// Index is the starting point to start the comparison
52// Word is the small sequence of bases to match
53//
54template<typename sequenceType, typename sequenceIndexType, typename wordType>
55bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word)
56{
57 if( (index + word.size()) >= sequence.size() ) return false;
58
59 for(size_t i = 0; i < word.size(); i++) {
60 if( sequence[index + i] != word[i]) return false;
61 }
62 return true;
63}
64
65//
66// printNearbyWords(output, sequence, index, word, deviation) searches
67// for 'deviation' bases on either side of the index into sequence
68// and prints all occurrences where word appears.
69//
70template<typename sequenceType, typename sequenceIndexType, typename wordType>
71void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word, int deviation)
72{
73 for (sequenceIndexType i = index - deviation; i < index + deviation; i++)
74 {
75 if (wordMatch(sequence, i, word))
76 {
77 output << "word '"
78 << word
79 << "' found "
80 << i - index
81 << " away from position "
82 << index
83 << "."
84 << std::endl;
85 }
86 }
87}
88
89//
90// getString(sequence, index, baseCount, word) - populate word with the 'baseCount'
91// bases that occur at the 'index' starting position in sequence.
92//
93template<typename sequenceType, typename sequenceIndexType, typename wordType>
94void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word)
95{
96 word.clear();
97 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
98 {
99 word.push_back(sequence[index + i]);
100 }
101}
102
103//
104// getHighLightedString() is a debugging aid for printing "highlighted"
105// subsets of bases, where the highlighting is done via turning the
106// base into a lower case ASCII equivalent.
107//
108template<typename sequenceType, typename sequenceIndexType, typename wordType>
109void getHighLightedString(
110 sequenceType &sequence,
111 sequenceIndexType index,
112 int baseCount,
113 wordType &word,
114 sequenceIndexType highLightStart,
115 sequenceIndexType highLightEnd)
116{
117 word.clear();
118 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
119 {
120 char base = sequence[index+i];
121
122 if(in(index+i, highLightStart, highLightEnd))
123 base = tolower(base);
124
125 word.push_back(base);
126 }
127}
128
129//
130// printBaseContext() outputs a base at location 'index' along with 'baseCount'
131// bases on either side of that base (default 30).
132//
133template<typename sequenceType, typename sequenceIndexType>
134void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30)
135{
136 output << "index: " << index << std::endl;
137 for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++)
138 output << sequence[i];
139 output << std::endl;
140 for (sequenceIndexType i=index-baseCount; i<index; i++)
141 std::cout << " ";
142 output << "^";
143 output << std::endl;
144}
145
146template<typename sequenceType, typename sequenceIndexType>
147void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
148{
149 result = "";
150 for (uint32_t i=0; i < read.size(); i++)
151 {
152 if (read[i] == sequence[location+i])
153 result.push_back(' ');
154 else
155 result.push_back('^');
156 }
157}
158
159template<typename sequenceType, typename sequenceIndexType>
160void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
161{
162 result = "";
163 for (uint32_t i=0; i < read.size(); i++)
164 {
165 if (read[i] == sequence[location+i])
166 result.push_back(toupper(read[i]));
167 else
168 result.push_back(tolower(read[i]));
169 }
170}
171
172/// Return the mismatch count, disregarding CIGAR strings
173///
174/// \param read is the sequence we're counting mismatches in
175/// \param location is where in the genmoe we start comparing
176/// \param exclude is a wildcard character (e.g. '.' or 'N')
177///
178/// \return number of bases that don't match the reference, except those that match exclude
179
180template<typename sequenceType, typename sequenceIndexType, typename readType>
181int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0')
182{
183 int mismatchCount = 0;
184 for (uint32_t i=0; i<read.size(); i++)
185 if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i];
186 return mismatchCount;
187}
188
189/// brute force sumQ - no sanity checking
190///
191/// \param read shotgun sequencer read string
192/// \param qualities phred quality string of same length
193/// \param location the alignment location to check sumQ
194template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
195int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities)
196{
197 int sumQ = 0;
198 for (uint32_t i=0; i<read.size(); i++)
199 sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0);
200 return sumQ;
201};
202
203
204
205template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
206sequenceIndexType simpleLocalAligner(
207 sequenceType &sequence,
208 sequenceIndexType index,
209 readType &read,
210 qualityType &quality,
211 int windowSize)
212{
213 int bestScore = 1000000; // either mismatch count or sumQ
214 sequenceIndexType bestMatchLocation = -1;
215 for (int i=-windowSize; i<windowSize; i++)
216 {
217 int newScore;
218
219 // 'in' is a template, and types of all three args have to match
220 if( not in((size_t) index + i, (size_t) 0, sequence.size())) continue;
221
222 if (quality.size() == 0)
223 {
224 newScore = getMismatchCount(sequence, index + i, read);
225 }
226 else
227 {
228 newScore = getSumQ(sequence, index + i, read, quality);
229 }
230 if (newScore < bestScore)
231 {
232 bestScore = newScore;
233 bestMatchLocation = index + i;
234 }
235 }
236 return bestMatchLocation;
237}
238
239
240}
241
242typedef uint32_t PackedVectorIndex_t;
243
244//
245// this is actually a base space implementation, since
246// the load/set methods do indirection from the symbol
247// value through a symbol value to symbol lookup table.
248//
250{
251
252 public:
253 inline char operator[](PackedVectorIndex_t baseIndex)
254 {
255 return BaseAsciiMap::int2base[ (static_cast<PackedVector4Bit_t>(*this))[baseIndex]];
256 }
257 inline void set(PackedVectorIndex_t baseIndex, char value)
258 {
259 this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]);
260 }
261 inline void push_back(char value)
262 {
263 this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]);
264 }
265};
266
267std::ostream &operator << (std::ostream &stream, PackedSequenceData &v)
268{
269 for(size_t i=0; i<v.size(); i++) {
270 stream << i << ": " << v[i] << std::endl;
271 }
272 return stream;
273}
274
275
276//
277// Load a fasta format file from filename into the buffer
278// provided by the caller.
279// While parsing the fasta file, record each chromosome name,
280// its start location, and its size.
281//
282// NB: the caller must implement the logic to determine how
283// large the sequence data is. There is no correct way to do
284// this, because we can't reliably estimate here how much sequence
285// data is contained in a compressed file.
286//
287// To safely pre-allocate space in sequenceData, use the reserve() method
288// before calling this function.
289//
290template<typename SequenceDataType, typename ChromosomeNameType>
291bool loadFastaFile(const char *filename,
292 std::vector<SequenceDataType> &sequenceData,
293 std::vector<ChromosomeNameType> &chromosomeNames)
294{
295 InputFile inputStream(filename, "r", InputFile::DEFAULT);
296
297 if(!inputStream.isOpen()) {
298 std::cerr << "Failed to open file " << filename << "\n";
299 return true;
300 }
301
302 int whichChromosome = -1;
303
304 //
305 // chromosomeNames is cheap to clear, so do it here.
306 //
307 // NB: I explicitly choose not to clear the sequence data
308 // container, this allows the caller to pre-allocate based
309 // on their knowledge of the size of the expected genome.
310 //
311
312 chromosomeNames.clear();
313
314 char ch;
315 while((ch = inputStream.ifgetc()) != EOF) {
316 switch (ch)
317 {
318 case '\n':
319 case '\r':
320 break;
321 case '>':
322 {
323 std::string chromosomeName = "";
324 //
325 // pull out the chromosome new name
326 //
327 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
328 {
329 chromosomeName += ch; // slow, but who cares
330 }
331 //
332 // eat the rest of the line
333 //
334 do {
335 ch = inputStream.ifgetc();
336 } while(ch != EOF && ch != '\n' && ch != '\r');
337 //
338 // save the Chromosome name and index into our
339 // header so we can use them later.
340 //
341 chromosomeNames.push_back(chromosomeName);
342
343 whichChromosome++;
344
345 sequenceData.resize(whichChromosome+1);
346
347 break;
348 }
349 default:
350 // we get here for sequence data.
351 //
352 // save the base value
353 // Note: invalid characters come here as well, but we
354 // let ::set deal with mapping them.
355
356 sequenceData[whichChromosome].push_back(toupper(ch));
357#if 0
358 if (isColorSpace())
359 {
360//
361// anything outside these values represents an invalid base
362// base codes: 0-> A, 1-> C, 2-> G, 3-> T
363// colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
364//
365 const char fromBase2CS[] =
366 {
367 /* 0000 */ 0, // A->A
368 /* 0001 */ 1, // A->C
369 /* 0010 */ 2, // A->G
370 /* 0011 */ 3, // A->T
371 /* 0100 */ 1, // C->A
372 /* 0101 */ 0, // C->C
373 /* 0110 */ 3, // C->G
374 /* 0111 */ 2, // C->T
375 /* 1000 */ 2, // G->A
376 /* 1001 */ 3, // G->C
377 /* 1010 */ 0, // G->G
378 /* 1011 */ 1, // G->T
379 /* 1100 */ 3, // T->A
380 /* 1101 */ 2, // T->C
381 /* 1110 */ 1, // T->G
382 /* 1111 */ 0, // T->T
383 };
384 //
385 // we are writing color space values on transitions,
386 // so we don't write a colorspace value when we
387 // get the first base value.
388 //
389 // On second and subsequent bases, write based on
390 // the index table above
391 //
392 char thisBase = base2int[(int)(fasta[fastaIndex])];
393 if (lastBase>=0)
394 {
395 char color;
396 if (lastBase>3 || thisBase>3) color=4;
397 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
398 // re-use the int to base, because ::set expects a base char (ATCG), not
399 // a color code (0123). It should only matter on final output.
400 set(header->elementCount++, int2base[(int) color]);
401 }
402 lastBase = thisBase;
403 }
404 else
405 {
406 set(header->elementCount++, toupper(fasta[fastaIndex]));
407 }
408#endif
409 break;
410 }
411 }
412 return false;
413}
414
415#endif
static unsigned char base2int[256+1]
Map ASCII values to a 2 (or 3) bit encoding for the base pair value for just base space (ACTGNactgn).
static const char int2base[]
Convert from int representation to the base.
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition InputFile.h:37
@ DEFAULT
Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
Definition InputFile.h:45