21 #include "FastQFile.h"
22 #include "BaseUtilities.h"
36 myMinReadLength(minReadLength),
37 myNumPrintableErrors(numPrintableErrors),
39 myDisableMessages(false),
49 myDisableMessages =
true;
55 myDisableMessages =
false;
78 myMaxErrors = maxErrors;
91 myQualPerCycle.clear();
92 myCountPerCycle.clear();
103 myFile =
ifopen(fileName,
"rt");
104 myFileName = fileName;
116 std::string errorMessage =
"ERROR: Failed to open file: ";
117 errorMessage += fileName;
118 logMessage(errorMessage.c_str());
144 std::string errorMessage =
"Failed to close file: ";
145 errorMessage += myFileName.c_str();
146 logMessage(errorMessage.c_str());
156 if((myFile != NULL) && (myFile->
isOpen()))
171 if((myFile != NULL) && (
ifeof(myFile)))
186 if(
isEof() || myFileProblem)
208 int numSequences = 0;
216 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
237 myBaseComposition.
print();
245 std::string finishMessage =
"Finished processing ";
246 finishMessage += myFileName.c_str();
249 " with %u lines containing %d sequences.",
250 myLineNum, numSequences) > 0)
252 finishMessage += buffer;
253 logMessage(finishMessage.c_str());
256 "There were a total of %d errors.",
271 else if(myNumErrors == 0)
276 if(numSequences == 0)
279 logMessage(
"ERROR: No FastQSequences in the file.");
304 std::string message =
305 "ERROR: Trying to read a fastq file but no file is open.";
306 logMessage(message.c_str());
311 resetForEachSequence();
322 valid = validateSequenceIdentifierLine();
335 if(mySequenceIdLine.Length() != 0)
339 myErrorString =
"Incomplete Sequence.\n";
364 valid &= validateRawSequenceAndPlusLines();
382 myErrorString =
"Incomplete Sequence, missing Quality String.";
389 valid &= validateQualityStringLines();
405 bool FastQFile::validateSequenceIdentifierLine()
408 int readStatus = mySequenceIdLine.ReadLine(myFile);
418 myFileProblem =
true;
419 myErrorString =
"Failure trying to read sequence identifier line";
426 if((mySequenceIdLine.Length() == 0) && (
ifeof(myFile)))
437 if(mySequenceIdLine.Length() < 2)
440 myErrorString =
"The sequence identifier line was too short.";
446 if(mySequenceIdLine[0] !=
'@')
449 myErrorString =
"First line of a sequence does not begin with @";
460 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(
' ', 1);
463 if(endSequenceIdentifier == -1)
467 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
474 if(endSequenceIdentifier <= 1)
477 "No Sequence Identifier specified before the comment.";
482 mySequenceIdentifier =
483 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
491 std::pair<std::map<std::string, unsigned int>::iterator,
bool> insertResult;
493 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
496 if(insertResult.second ==
false)
499 myErrorString =
"Repeated Sequence Identifier: ";
500 myErrorString += mySequenceIdentifier.c_str();
501 myErrorString +=
" at Lines ";
502 myErrorString += insertResult.first->second;
503 myErrorString +=
" and ";
504 myErrorString += myLineNum;
520 bool FastQFile::validateRawSequenceAndPlusLines()
523 int readStatus = myRawSequence.ReadLine(myFile);
529 myFileProblem =
true;
530 myErrorString =
"Failure trying to read sequence line";
539 bool valid = validateRawSequence(offset);
542 offset = myRawSequence.Length();
547 bool stillRawLine =
true;
548 while(stillRawLine &&
560 readStatus = myPlusLine.ReadLine(myFile);
565 myFileProblem =
true;
566 myErrorString =
"Failure trying to read sequence/plus line";
572 if(myPlusLine.Length() == 0)
577 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
581 else if(myPlusLine[0] ==
'+')
584 valid &= validateSequencePlus();
585 stillRawLine =
false;
592 myRawSequence += myPlusLine;
593 myPlusLine.SetLength(0);
594 valid &= validateRawSequence(offset);
597 offset = myRawSequence.Length();
609 if(myRawSequence.Length() < myMinReadLength)
612 myErrorString =
"Raw Sequence is shorter than the min read length: ";
613 myErrorString += myRawSequence.Length();
614 myErrorString +=
" < ";
615 myErrorString += myMinReadLength;
630 myErrorString =
"Reached the end of the file without a '+' line.";
640 bool FastQFile::validateQualityStringLines()
643 int readStatus = myQualityString.ReadLine(myFile);
648 myFileProblem =
true;
649 myErrorString =
"Failure trying to read quality line";
658 bool valid = validateQualityString(offset);
660 offset = myQualityString.Length();
664 while((myQualityString.Length() < myRawSequence.Length()) &&
674 readStatus = myTempPartialQuality.ReadLine(myFile);
679 myFileProblem =
true;
680 myErrorString =
"Failure trying to read quality line";
685 myQualityString += myTempPartialQuality;
686 myTempPartialQuality.Clear();
689 valid &= validateQualityString(offset);
690 offset = myQualityString.Length();
701 if(myQualityString.Length() != myRawSequence.Length())
703 myErrorString =
"Quality string length (";
704 myErrorString += myQualityString.Length();
705 myErrorString +=
") does not equal raw sequence length (";
706 myErrorString += myRawSequence.Length();
707 myErrorString +=
")";
716 bool FastQFile::validateRawSequence(
int offset)
718 bool validBase =
false;
722 for(
int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
729 myRawSequence[sequenceIndex]);
734 myErrorString =
"Invalid character ('";
735 myErrorString += myRawSequence[sequenceIndex];
736 myErrorString +=
"') in base sequence.";
752 bool FastQFile::validateSequencePlus()
758 int lineLength = myPlusLine.Length();
763 if((lineLength == 1) || (myPlusLine[1] ==
' '))
775 int sequenceIdentifierLength = mySequenceIdentifier.Length();
776 if(lineLength <= sequenceIdentifierLength)
779 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
791 while((same ==
true) && (seqIndex < sequenceIdentifierLength))
793 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
796 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
808 bool FastQFile::validateQualityString(
int offset)
811 if(myQualityString.Length() > (
int)(myQualPerCycle.size()))
813 myQualPerCycle.resize(myQualityString.Length());
814 myCountPerCycle.resize(myQualityString.Length());
817 for(
int i=offset; i < myQualityString.Length(); i++)
819 if(myQualityString[i] <= 32)
821 myErrorString =
"Invalid character ('";
822 myErrorString += myQualityString[i];
823 myErrorString +=
"') in quality string.";
835 myCountPerCycle[i] += 1;
845 void FastQFile::reportErrorOnLine()
851 if(myNumErrors <= myNumPrintableErrors)
855 sprintf(buffer,
"ERROR on Line %u: ", myLineNum);
856 std::string message = buffer;
857 message += myErrorString.c_str();
858 logMessage(message.c_str());
864 void FastQFile::reset()
868 resetForEachSequence();
871 myFileName.SetLength(0);
872 myIdentifierMap.clear();
873 myBaseComposition.
clear();
874 myQualPerCycle.clear();
875 myCountPerCycle.clear();
876 myFileProblem =
false;
881 void FastQFile::resetForEachSequence()
883 myLineBuffer.SetLength(0);
884 myErrorString.SetLength(0);
885 myRawSequence.SetLength(0);
886 mySequenceIdLine.SetLength(0);
887 mySequenceIdentifier.SetLength(0);
888 myPlusLine.SetLength(0);
889 myQualityString.SetLength(0);
890 myTempPartialQuality.SetLength(0);
894 void FastQFile::logMessage(
const char* logMessage)
897 if(!myDisableMessages)
899 std::cout << logMessage << std::endl;
906 bool FastQFile::isTimeToQuit()
910 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
918 void FastQFile::printAvgQual()
920 std::cout << std::endl <<
"Average Phred Quality by Read Index (starts at 0):" << std::endl;
921 std::cout.precision(2);
922 std::cout << std::fixed <<
"Read Index\tAverage Quality"
924 if(myQualPerCycle.size() != myCountPerCycle.size())
927 std::cerr <<
"ERROR calculating the average Qualities per cycle\n";
933 for(
unsigned int i = 0; i < myQualPerCycle.size(); i++)
936 if(myCountPerCycle[i] != 0)
938 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
940 std::cout << i <<
"\t" << avgQual <<
"\n";
941 sumQual += myQualPerCycle[i];
942 count += myCountPerCycle[i];
944 std::cout << std::endl;
948 avgQual = sumQual / count;
950 std::cout <<
"Overall Average Phred Quality = " << avgQual << std::endl;