libStatGen Software 1
Loading...
Searching...
No Matches
FastQFile.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef __FASTQ_VALIDATOR_H__
19#define __FASTQ_VALIDATOR_H__
20
21#include <iostream>
22#include <map>
23#include "StringBasics.h"
24#include "InputFile.h"
25#include "BaseComposition.h"
26#include "FastQStatus.h"
27
28/// Class for reading/validating a fastq file.
29class FastQFile
30{
31 public:
32 /// Constructor.
33 /// \param minReadLength The minimum length that a base sequence must be for
34 /// it to be valid.
35 /// \param numPrintableErrors The maximum number of errors that should be reported
36 /// in detail before suppressing the errors.
37 FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
38
39 /// Disable messages - do not write to cout.
40 void disableMessages();
41
42 /// Enable messages - write to cout.
43 void enableMessages();
44
45 /// Disable Unique Sequence ID checking
46 /// (Unique Sequence ID checking is enabled by default).
47 void disableSeqIDCheck();
48
49 /// Enable Unique Sequence ID checking.
50 /// (Unique Sequence ID checking is enabled by default).
51 void enableSeqIDCheck();
52
53 /// Interleaved.
54 void interleaved();
55
56 /// Set the number of errors after which to quit reading/validating a file,
57 /// defaults to -1.
58 /// \param maxErrors # of errors before quitting,
59 /// -1 indicates to not quit until the entire file has been read/validated (default),
60 /// 0 indicates to quit without reading/validating anything.
61 void setMaxErrors(int maxErrors);
62
63 /// Open a FastQFile.
64 /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.
65 FastQStatus::Status openFile(const char* fileName,
66 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);
67
68 /// Close a FastQFile.
69 FastQStatus::Status closeFile();
70
71 /// Check to see if the file is open.
72 bool isOpen();
73
74 /// Check to see if the file is at the end of the file.
75 bool isEof();
76
77 /// Returns whether or not to keep reading the file,
78 /// it stops reading (false) if eof or there is a problem reading the file.
79 bool keepReadingFile();
80
81 /// Validate the specified fastq file
82 /// \param filename fastq file to be validated.
83 /// \param printBaseComp whether or not to print the base composition for the file.
84 /// true means print it, false means do not.
85 /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
86 /// or UNKNOWN (UNKNOWN means to determine the spaceType to
87 /// validate against from the first character of the first
88 /// sequence).
89 /// \param printQualAvg whether or not to print the quality averages for the file.
90 /// true means to print it, false (default) means do not.
91 /// \return the fastq validation status, SUCCESS on a successfully
92 /// validated fastq file.
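93 FastQStatus::Status validateFastQFile(const String &filename,
94 bool printBaseComp,
95 BaseAsciiMap::SPACE_TYPE spaceType,
96 bool printQualAvg = false);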
97
98 /// Read 1 FastQSequence, validating it.
99 FastQStatus::Status readFastQSequence();
100
101 ///////////////////////
102 /// @name Public Sequence Line variables.
103 /// Keep public variables for a sequence's line so they can be accessed
104 /// without having to do string copies.
105 //@{
106 String myRawSequence;
107 String mySequenceIdLine;
108 String mySequenceIdentifier;
109 String myPlusLine;
110 String myQualityString;
111 //@}
112
113 /// Get the space type used for this file.
114 BaseAsciiMap::SPACE_TYPE getSpaceType()
115 {
116 return(myBaseComposition.getSpaceType());
117 }
118
119private:
120 // Validates a single fastq sequence from myFile.
121 bool validateFastQSequence();
122
123 // Reads and validates the sequence identifier line of a fastq sequence.
124 bool validateSequenceIdentifierLine();
125
126 // Reads and validates the raw sequence line(s) and the plus line. Both are
127 // included in one method since it is unknown when the raw sequence line
128 // ends until you find the plus line that divides it from the quality
129 // string. Since this method will read the plus line to know when the
130 // raw sequence ends, it also validates that line.
131 bool validateRawSequenceAndPlusLines();
132
133 // Reads and validates the quality string line(s).
134 bool validateQualityStringLines();
135
136 // Method to validate a line that contains part of the raw sequence.
137 // offset specifies where in the sequence to start validating.
138 bool validateRawSequence(int offset);
139
140 // Method to validate the "+" line that separates the raw sequence and the
141 // quality string.
142 bool validateSequencePlus();
143
144 // Method to validate the quality string.
145 // offset specifies where in the quality string to start validating.
146 bool validateQualityString(int offset);
147
148 // Helper method to read a line from the input file into a string.
149 // It also tracks the line number.
150 void readLine();
151
152 // Helper method for printing the contents of myErrorString. It will
153 // only print the errors until the maximum number of reportable errors is
154 // reached.
155 void reportErrorOnLine();
156
157 // Reset the member data for each fastq file.
158 void reset();
159
160 // Reset the member data for each sequence.
161 void resetForEachSequence();
162
163 // Log the specified message if enabled.
164 void logMessage(const char* message);
165
166 // Determine if it is time to quit by checking if we are to quit after a
167 // certain number of errors and that many errors have been encountered.
168 bool isTimeToQuit();
169
170 void printAvgQual();
171
172 //////////////////////////////////////////////////////////////////////
173 // Following member data elements are reset for each validated sequence.
174 //
175
176 // Buffer for storing the contents of the line read.
177 // Stored as member data so memory allocation is only done once.
178 String myLineBuffer;
179
180 // Buffer for storing the error string. This prevents the reallocation of
181 // the string buffer for each error.
182 String myErrorString;
183
184 String myTempPartialQuality;
185
186 //////////////////////////////////////////////////////////////////////
187 // Following member data elements are reset for each validated file.
188 //
189 IFILE myFile; // Input file to be read.
190 String myFileName; // Name of file being processed.
191 int myNumErrors; // Tracks the number of errors.
192 unsigned int myLineNum; // Track the line number - used for reporting errors.
193 BaseComposition myBaseComposition; // Tracks the base composition.
194 std::vector<int> myQualPerCycle; // Tracks the quality by cycle.
195 std::vector<int> myCountPerCycle; // Tracks the number of entries by cycle.
196
197 // Whether or not to check the sequence identifier for uniqueness.
198 // Checking may use up a lot of memory.
199 bool myCheckSeqID;
200
201 // Whether or not to check that the file is interleaved.
202 // Disabled by myCheckSeqID
203 bool myInterleaved;
204
205 // Previous sequence id for checking interleaved.
206 std::string myPrevSeqID;
207
208 // Map to track which identifiers have appeared in the file.
209 std::map<std::string, unsigned int> myIdentifierMap;
210
211 //////////////////////////////////////////////////////////////////////
212 // Following member data do not change for each call to the validator.
213 //
214 int myMinReadLength; // Min Length for a read.
215 int myNumPrintableErrors; // Max number of errors to print the details of.
216
217 // Number of errors after which to quit reading/validating a file.
218 // Defaults to -1.
219 // -1 indicates to not quit until the entire file has been read/validated.
220 // 0 indicates to quit without reading/validating anything.
221 int myMaxErrors;
222
223 // Whether or not messages should be printed.
224 // Defaulted to false (they should be printed).
225 bool myDisableMessages;
226
227 // Track if there is a problem reading the file. If there are read
228 // problems, stop reading the file.
229 bool myFileProblem;
230};
231
232#endif
SPACE_TYPE
The type of space (color or base) to use in the mapping.
@ UNKNOWN
Base decision on the first raw seq character/type has yet to be determined.
Class that tracks the composition of base by read location.
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type for this composition.
Class for reading/validating a fastq file.
Definition FastQFile.h:30
void interleaved()
Interleaved.
Definition FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition FastQFile.cpp:63
BaseAsciiMap::SPACE_TYPE getSpaceType()
Get the space type used for this file.
Definition FastQFile.h:114
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
FastQStatus::Status closeFile()
Close a FastQFile.
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a problem reading the file.
void enableMessages()
Enable messages - write to cout.
Definition FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition FastQFile.cpp:85
bool isEof()
Check to see if the file is at the end of the file.
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition FastQStatus.h:31
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition InputFile.h:37