libStatGen Software  1
FastQFile.h
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __FASTQ_VALIDATOR_H__
19 #define __FASTQ_VALIDATOR_H__
20 
21 #include <iostream>
22 #include <map>
23 #include "StringBasics.h"
24 #include "InputFile.h"
25 #include "BaseComposition.h"
26 #include "FastQStatus.h"
27 
28 /// Class for reading/validating a fastq file.
29 class FastQFile
30 {
31  public:
32  /// Constructor.
33  /// /param minReadLength The minimum length that a base sequence must be for
34  /// it to be valid.
35  /// \param numPrintableErrors The maximum number of errors that should be reported
36  /// in detail before suppressing the errors.
37  FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
38 
39  /// Disable messages - do not write to cout.
40  void disableMessages();
41 
42  /// Enable messages - write to cout.
43  void enableMessages();
44 
45  /// Disable Unique Sequence ID checking
46  /// (Unique Sequence ID checking is enabled by default).
47  void disableSeqIDCheck();
48 
49  /// Enable Unique Sequence ID checking.
50  /// (Unique Sequence ID checking is enabled by default).
51  void enableSeqIDCheck();
52 
53  /// Set the number of errors after which to quit reading/validating a file,
54  /// defaults to -1.
55  /// \param maxErrors # of errors before quitting,
56  /// -1 indicates to not quit until the entire file has been read/validated (default),
57  /// 0 indicates to quit without reading/validating anything.
58  void setMaxErrors(int maxErrors);
59 
60  /// Open a FastQFile.
61  /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.
62  FastQStatus::Status openFile(const char* fileName,
64 
65  /// Close a FastQFile.
67 
68  /// Check to see if the file is open.
69  bool isOpen();
70 
71  /// Check to see if the file is at the end of the file.
72  bool isEof();
73 
74  /// Returns whether or not to keep reading the file,
75  /// it stops reading (false) if eof or there is a problem reading the file.
76  bool keepReadingFile();
77 
78  /// Validate the specified fastq file
79  /// \param filename fastq file to be validated.
80  /// \param printBaseComp whether or not to print the base composition for the file.
81  /// true means print it, false means do not.
82  /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
83  /// or UNKNOWN (UNKNOWN means to determine the spaceType to
84  /// validate against from the first character of the first
85  /// sequence).
86  /// \param printQualAvg whether or not to print the quality averages for the file.
87  /// true means to print it, false (default) means do not.
88  /// \return the fastq validation status, SUCCESS on a successfully
89  /// validated fastq file.
91  bool printBaseComp,
92  BaseAsciiMap::SPACE_TYPE spaceType,
93  bool printQualAvg = false);
94 
95  /// Read 1 FastQSequence, validating it.
97 
98  ///////////////////////
99  /// @name Public Sequence Line variables.
100  /// Keep public variables for a sequence's line so they can be accessed
101  /// without having to do string copies.
102  //@{
103  String myRawSequence;
104  String mySequenceIdLine;
105  String mySequenceIdentifier;
106  String myPlusLine;
107  String myQualityString;
108  //@}
109 
110  /// Get the space type used for this file.
112  {
113  return(myBaseComposition.getSpaceType());
114  }
115 
116 private:
117  // Validates a single fastq sequence from myFile.
118  bool validateFastQSequence();
119 
120  // Reads and validates the sequence identifier line of a fastq sequence.
121  bool validateSequenceIdentifierLine();
122 
123  // Reads and validates the raw sequence line(s) and the plus line. Both are
124  // included in one method since it is unknown when the raw sequence line
125  // ends until you find the plus line that divides it from the quality
126  // string. Since this method will read the plus line to know when the
127  // raw sequence ends, it also validates that line.
128  bool validateRawSequenceAndPlusLines();
129 
130  // Reads and validates the quality string line(s).
131  bool validateQualityStringLines();
132 
133  // Method to validate a line that contains part of the raw sequence.
134  // offset specifies where in the sequence to start validating.
135  bool validateRawSequence(int offset);
136 
137  // Method to validate the "+" line that seperates the raw sequence and the
138  // quality string.
139  bool validateSequencePlus();
140 
141  // Method to validate the quality string.
142  // offset specifies where in the quality string to start validating.
143  bool validateQualityString(int offset);
144 
145  // Helper method to read a line from the input file into a string.
146  // It also tracks the line number.
147  void readLine();
148 
149  // Helper method for printing the contents of myErrorString. It will
150  // only print the errors until the maximum number of reportable errors is
151  // reached.
152  void reportErrorOnLine();
153 
154  // Reset the member data for each fastq file.
155  void reset();
156 
157  // Reset the member data for each sequence.
158  void resetForEachSequence();
159 
160  // Log the specified message if enabled.
161  void logMessage(const char* message);
162 
163  // Determine if it is time to quit by checking if we are to quit after a
164  // certain number of errors and that many errors have been encountered.
165  bool isTimeToQuit();
166 
167  void printAvgQual();
168 
169  //////////////////////////////////////////////////////////////////////
170  // Following member data elements are reset for each validated sequence.
171  //
172 
173  // Buffer for storing the contents of the line read.
174  // Stored as member data so memory allocation is only done once.
175  String myLineBuffer;
176 
177  // Buffer for storing the error string. This prevents the reallocation of
178  // the string buffer for each error.
179  String myErrorString;
180 
181  String myTempPartialQuality;
182 
183  //////////////////////////////////////////////////////////////////////
184  // Following member data elements are reset for each validated file.
185  //
186  IFILE myFile; // Input file to be read.
187  String myFileName; // Name of file being processed.
188  int myNumErrors; // Tracks the number of errors.
189  unsigned int myLineNum; // Track the line number - used for reporting errors.
190  BaseComposition myBaseComposition; // Tracks the base composition.
191  std::vector<int> myQualPerCycle; // Tracks the quality by cycle.
192  std::vector<int> myCountPerCycle; // Tracks the number of entries by cycle.
193 
194  // Whether or not to check the sequence identifier for uniqueness.
195  // Checking may use up a lot of memory.
196  bool myCheckSeqID;
197 
198  // Map to track which identifiers have appeared in the file.
199  std::map<std::string, unsigned int> myIdentifierMap;
200 
201  //////////////////////////////////////////////////////////////////////
202  // Following member data do not change for each call to the validator.
203  //
204  int myMinReadLength; // Min Length for a read.
205  int myNumPrintableErrors; // Max number of errors to print the details of.
206 
207  // Number of errors after which to quit reading/validating a file.
208  // Defaults to -1.
209  // -1 indicates to not quit until the entire file has been read/validated.
210  // 0 indicates to quit without reading/validating anything.
211  int myMaxErrors;
212 
213  // Whether or not messages should be printed.
214  // Defaulted to false (they should be printed).
215  bool myDisableMessages;
216 
217  // Track if there is a problem reading the file. If there are read
218  // problems, stop reading the file.
219  bool myFileProblem;
220 };
221 
222 #endif
FastQFile::openFile
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:83
FastQFile::isOpen
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:153
FastQFile::FastQFile
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
FastQFile::disableSeqIDCheck
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:61
BaseAsciiMap::UNKNOWN
@ UNKNOWN
Base decision on the first raw seq character/type has yet to be determined.
Definition: BaseAsciiMap.h:47
String
Definition: StringBasics.h:38
FastQFile::enableMessages
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:53
FastQFile::disableMessages
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:47
BaseComposition::getSpaceType
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type for this composition.
Definition: BaseComposition.h:40
BaseAsciiMap::SPACE_TYPE
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
FastQFile::closeFile
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:125
BaseComposition
Class that tracks the composition of base by read location.
Definition: BaseComposition.h:27
FastQFile::keepReadingFile
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a problem reading the file.
Definition: FastQFile.cpp:184
InputFile.h
InputFile
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:36
FastQFile
Class for reading/validating a fastq file.
Definition: FastQFile.h:29
FastQFile::setMaxErrors
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:76
FastQFile::getSpaceType
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type used for this file.
Definition: FastQFile.h:111
FastQFile::readFastQSequence
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:299
FastQFile::validateFastQFile
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:195
FastQFile::enableSeqIDCheck
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:69
FastQStatus::Status
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:30
FastQFile::isEof
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:168