22#include "BaseUtilities.h"
38 myMinReadLength(minReadLength),
39 myNumPrintableErrors(numPrintableErrors),
41 myDisableMessages(false),
51 myDisableMessages =
true;
57 myDisableMessages =
false;
87 myMaxErrors = maxErrors;
100 myQualPerCycle.clear();
101 myCountPerCycle.clear();
112 myFile =
ifopen(fileName,
"rt");
113 myFileName = fileName;
125 std::string errorMessage =
"ERROR: Failed to open file: ";
126 errorMessage += fileName;
127 logMessage(errorMessage.c_str());
153 std::string errorMessage =
"Failed to close file: ";
154 errorMessage += myFileName.c_str();
155 logMessage(errorMessage.c_str());
165 if((myFile != NULL) && (myFile->
isOpen()))
180 if((myFile != NULL) && (
ifeof(myFile)))
195 if(
isEof() || myFileProblem)
217 int numSequences = 0;
225 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
246 myBaseComposition.
print();
254 std::string finishMessage =
"Finished processing ";
255 finishMessage += myFileName.c_str();
258 " with %u lines containing %d sequences.",
259 myLineNum, numSequences) > 0)
261 finishMessage += buffer;
262 logMessage(finishMessage.c_str());
265 "There were a total of %d errors.",
281 else if(myNumErrors == 0)
286 if(numSequences == 0)
289 logMessage(
"ERROR: No FastQSequences in the file.");
314 std::string message =
315 "ERROR: Trying to read a fastq file but no file is open.";
316 logMessage(message.c_str());
321 resetForEachSequence();
332 valid = validateSequenceIdentifierLine();
345 if(mySequenceIdLine.Length() != 0)
349 myErrorString =
"Incomplete Sequence.\n";
374 valid &= validateRawSequenceAndPlusLines();
392 myErrorString =
"Incomplete Sequence, missing Quality String.";
399 valid &= validateQualityStringLines();
415bool FastQFile::validateSequenceIdentifierLine()
418 int readStatus = mySequenceIdLine.ReadLine(myFile);
428 myFileProblem =
true;
429 myErrorString =
"Failure trying to read sequence identifier line";
436 if((mySequenceIdLine.Length() == 0) && (
ifeof(myFile)))
447 if(mySequenceIdLine.Length() < 2)
450 myErrorString =
"The sequence identifier line was too short.";
456 if(mySequenceIdLine[0] !=
'@')
459 myErrorString =
"First line of a sequence does not begin with @";
470 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(
' ', 1);
473 if(endSequenceIdentifier == -1)
477 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
484 if(endSequenceIdentifier <= 1)
487 "No Sequence Identifier specified before the comment.";
492 mySequenceIdentifier =
493 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
497 if(myInterleaved && (myPrevSeqID !=
""))
501 if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
504 if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) ||
505 (((myPrevSeqID[myPrevSeqID.length()-1] !=
'1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] !=
'2')) &&
506 (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
508 myErrorString =
"Interleaved: consecutive reads do not have matching sequence identifiers: ";
509 myErrorString += mySequenceIdentifier.c_str();
510 myErrorString +=
" and ";
511 myErrorString += myPrevSeqID.c_str();
523 myPrevSeqID = mySequenceIdentifier.c_str();
532 std::pair<std::map<std::string, unsigned int>::iterator,
bool> insertResult;
534 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
537 if(insertResult.second ==
false)
540 myErrorString =
"Repeated Sequence Identifier: ";
541 myErrorString += mySequenceIdentifier.c_str();
542 myErrorString +=
" at Lines ";
543 myErrorString += insertResult.first->second;
544 myErrorString +=
" and ";
545 myErrorString += myLineNum;
562bool FastQFile::validateRawSequenceAndPlusLines()
565 int readStatus = myRawSequence.ReadLine(myFile);
571 myFileProblem =
true;
572 myErrorString =
"Failure trying to read sequence line";
581 bool valid = validateRawSequence(offset);
584 offset = myRawSequence.Length();
589 bool stillRawLine =
true;
590 while(stillRawLine &&
602 readStatus = myPlusLine.ReadLine(myFile);
607 myFileProblem =
true;
608 myErrorString =
"Failure trying to read sequence/plus line";
614 if(myPlusLine.Length() == 0)
619 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
623 else if(myPlusLine[0] ==
'+')
626 valid &= validateSequencePlus();
627 stillRawLine =
false;
634 myRawSequence += myPlusLine;
635 myPlusLine.SetLength(0);
636 valid &= validateRawSequence(offset);
639 offset = myRawSequence.Length();
651 if(myRawSequence.Length() < myMinReadLength)
654 myErrorString =
"Raw Sequence is shorter than the min read length: ";
655 myErrorString += myRawSequence.Length();
656 myErrorString +=
" < ";
657 myErrorString += myMinReadLength;
672 myErrorString =
"Reached the end of the file without a '+' line.";
682bool FastQFile::validateQualityStringLines()
685 int readStatus = myQualityString.ReadLine(myFile);
690 myFileProblem =
true;
691 myErrorString =
"Failure trying to read quality line";
700 bool valid = validateQualityString(offset);
702 offset = myQualityString.Length();
706 while((myQualityString.Length() < myRawSequence.Length()) &&
716 readStatus = myTempPartialQuality.ReadLine(myFile);
721 myFileProblem =
true;
722 myErrorString =
"Failure trying to read quality line";
727 myQualityString += myTempPartialQuality;
728 myTempPartialQuality.Clear();
731 valid &= validateQualityString(offset);
732 offset = myQualityString.Length();
743 if(myQualityString.Length() != myRawSequence.Length())
745 myErrorString =
"Quality string length (";
746 myErrorString += myQualityString.Length();
747 myErrorString +=
") does not equal raw sequence length (";
748 myErrorString += myRawSequence.Length();
749 myErrorString +=
")";
758bool FastQFile::validateRawSequence(
int offset)
760 bool validBase =
false;
764 for(
int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
771 myRawSequence[sequenceIndex]);
776 myErrorString =
"Invalid character ('";
777 myErrorString += myRawSequence[sequenceIndex];
778 myErrorString +=
"') in base sequence.";
794bool FastQFile::validateSequencePlus()
800 int lineLength = myPlusLine.Length();
805 if((lineLength == 1) || (myPlusLine[1] ==
' '))
817 int sequenceIdentifierLength = mySequenceIdentifier.Length();
818 if(lineLength <= sequenceIdentifierLength)
821 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
833 while((same ==
true) && (seqIndex < sequenceIdentifierLength))
835 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
838 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
850bool FastQFile::validateQualityString(
int offset)
853 if(myQualityString.Length() > (
int)(myQualPerCycle.size()))
855 myQualPerCycle.resize(myQualityString.Length());
856 myCountPerCycle.resize(myQualityString.Length());
859 for(
int i=offset; i < myQualityString.Length(); i++)
861 if(myQualityString[i] <= 32)
863 myErrorString =
"Invalid character ('";
864 myErrorString += myQualityString[i];
865 myErrorString +=
"') in quality string.";
877 myCountPerCycle[i] += 1;
887void FastQFile::reportErrorOnLine()
893 if(myNumErrors <= myNumPrintableErrors)
897 sprintf(buffer,
"ERROR on Line %u: ", myLineNum);
898 std::string message = buffer;
899 message += myErrorString.c_str();
900 logMessage(message.c_str());
906void FastQFile::reset()
910 resetForEachSequence();
913 myFileName.SetLength(0);
914 myIdentifierMap.clear();
915 myBaseComposition.
clear();
916 myQualPerCycle.clear();
917 myCountPerCycle.clear();
918 myFileProblem =
false;
923void FastQFile::resetForEachSequence()
925 myLineBuffer.SetLength(0);
926 myErrorString.SetLength(0);
927 myRawSequence.SetLength(0);
928 mySequenceIdLine.SetLength(0);
929 mySequenceIdentifier.SetLength(0);
930 myPlusLine.SetLength(0);
931 myQualityString.SetLength(0);
932 myTempPartialQuality.SetLength(0);
936void FastQFile::logMessage(
const char* logMessage)
939 if(!myDisableMessages)
941 std::cout << logMessage << std::endl;
948bool FastQFile::isTimeToQuit()
952 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
960void FastQFile::printAvgQual()
962 std::cout << std::endl <<
"Average Phred Quality by Read Index (starts at 0):" << std::endl;
963 std::cout.precision(2);
964 std::cout << std::fixed <<
"Read Index\tAverage Quality"
966 if(myQualPerCycle.size() != myCountPerCycle.size())
969 std::cerr <<
"ERROR calculating the average Qualities per cycle\n";
975 for(
unsigned int i = 0; i < myQualPerCycle.size(); i++)
978 if(myCountPerCycle[i] != 0)
980 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
982 std::cout << i <<
"\t" << avgQual <<
"\n";
983 sumQual += myQualPerCycle[i];
984 count += myCountPerCycle[i];
986 std::cout << std::endl;
990 avgQual = sumQual / count;
992 std::cout <<
"Overall Average Phred Quality = " << avgQual << std::endl;
SPACE_TYPE
The type of space (color or base) to use in the mapping.
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
void clear()
Clear the composition stored in the base count vector.
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
void print()
Print the composition.
void resetBaseMapType()
Reset the base map type for this composition.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
void interleaved()
Interleaved.
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
void disableMessages()
Disable messages - do not write to cout.
bool isOpen()
Check to see if the file is open.
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
FastQStatus::Status closeFile()
Close a FastQFile.
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
void enableMessages()
Enable messages - write to cout.
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
bool isEof()
Check to see if the file is at the end of the file.
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
@ FASTQ_SUCCESS
indicates method finished successfully.
@ FASTQ_INVALID
means that the sequence was invalid.
@ FASTQ_OPEN_ERROR
means the file could not be opened.
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
@ FASTQ_CLOSE_ERROR
means the file could not be closed.