ProteoWizard
Serializer_pepXML_Test.cpp
Go to the documentation of this file.
1//
2// $Id$
3//
4//
5// Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6//
7// Copyright 2010 Vanderbilt University - Nashville, TN 37232
8//
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12//
13// http://www.apache.org/licenses/LICENSE-2.0
14//
15// Unless required by applicable law or agreed to in writing, software
16// distributed under the License is distributed on an "AS IS" BASIS,
17// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18// See the License for the specific language governing permissions and
19// limitations under the License.
20//
21
22
23#include "Serializer_pepXML.hpp"
24#include "Diff.hpp"
25#include "References.hpp"
26#include "examples.hpp"
31#include "TextWriter.hpp"
32#include "boost/range/adaptor/transformed.hpp"
33#include "boost/range/algorithm/max_element.hpp"
34#include "boost/range/algorithm/min_element.hpp"
35#include "boost/range/algorithm_ext/erase.hpp"
36#include <cstring>
37
38
39using namespace pwiz::identdata;
40using namespace pwiz::identdata::examples;
41using namespace pwiz::util;
42namespace proteome = pwiz::proteome;
43
44ostream* os_ = 0;
45
47{
48 typedef int result_type;
49 int operator()(const EnzymePtr& x) const {return x->terminalSpecificity;}
50};
51
53{
54 typedef int result_type;
55 int operator()(const EnzymePtr& x) const {return x->missedCleavages;}
56};
57
59{
60 UserParamNameIs(const string& name) : name_(name) {}
61
62 bool operator() (const UserParam& up) const { return up.name == name_; }
63
64 string name_;
65};
66
68{
69 mzid.bibliographicReference.clear();
71 mzid.auditCollection.clear();
72 mzid.provider = Provider();
73 mzid.dataCollection.inputs.sourceFile.clear();
74
75 BOOST_FOREACH(AnalysisSoftwarePtr& as, mzid.analysisSoftwareList)
76 {
77 as->URI.clear();
78 as->customizations.clear();
79 as->contactRolePtr.reset();
80 }
81
83
84 // pepXML only provides a single min_number_termini and max_num_internal_cleavages for all enzymes
85 int minSpecificity = *boost::range::min_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_specificity()));
86 int maxMissedCleavages = *boost::range::max_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_missedCleavages()));
87 BOOST_FOREACH(const EnzymePtr& ez, sip.enzymes.enzymes)
88 {
89 ez->terminalSpecificity = (proteome::Digestion::Specificity) minSpecificity;
90 ez->missedCleavages = maxMissedCleavages;
91 }
92
93 // pepXML doesn't map these elements
94 sip.massTable.clear();
95 sip.threshold.clear();
96 sip.databaseFilters.clear();
97 sip.databaseTranslation.reset();
98
99 // pepXML doesn't map these attributes
100 mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->name.clear();
101 mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->version.clear();
102 mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->releaseDate.clear();
103 mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->databaseName.clear();
104
105 // pepXML doesn't reliably store location or file format
106 string& location = mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->location;
107 location = BFS_STRING(bfs::path(location).replace_extension("").filename());
108 mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->fileFormat = CVParam();
109
110 string& location2 = mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->location;
111 location2 = BFS_STRING(bfs::path(location2).replace_extension("").filename());
112
113 // pepXML doesn't support protein sequences
114 BOOST_FOREACH(DBSequencePtr& dbSequence, mzid.sequenceCollection.dbSequences)
115 {
116 dbSequence->seq.clear();
117 dbSequence->length = 0;
118 dbSequence->id = "DBSeq_" + dbSequence->accession;
119 }
120
121 // pepXML can only support one mass type (we pick the max mass in case one of them is 0)
122 BOOST_FOREACH(PeptidePtr& peptide, mzid.sequenceCollection.peptides)
123 BOOST_FOREACH(ModificationPtr& mod, peptide->modification)
124 mod->monoisotopicMassDelta = mod->avgMassDelta = max(mod->monoisotopicMassDelta, mod->avgMassDelta);
125
126 // pepXML doesn't support fragment metadata
127 mzid.dataCollection.analysisData.spectrumIdentificationList[0]->fragmentationTable.clear();
128
129 BOOST_FOREACH(SpectrumIdentificationResultPtr& sir, mzid.dataCollection.analysisData.spectrumIdentificationList[0]->spectrumIdentificationResult)
130 BOOST_FOREACH(SpectrumIdentificationItemPtr& sii, sir->spectrumIdentificationItem)
131 {
132 // pepXML doesn't support fragment metadata or mass tables
133 sii->fragmentation.clear();
134 sii->massTablePtr.reset();
135
136 for (size_t i=0; i < sii->peptideEvidencePtr.size(); ++i)
137 {
138 PeptideEvidence& pe = *sii->peptideEvidencePtr[i];
139
140 // pepXML does not store peptide start and end offsets
141 pe.start = pe.end = 0;
142
143 // pepXML's alternative_proteins do not store prev/next AA or missed cleavages
144 if (i > 0)
145 pe.pre = pe.post = '?';
146 }
147 }
148
149 // pepXML doesn't have protein assembly
152
153 // pepXML expects the residues to be '.' or an amino acid list
154 BOOST_FOREACH(SearchModificationPtr& sm, mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->modificationParams)
155 if (sm->residues.empty())
156 sm->residues.push_back('.');
157}
158
// Checks that a serialized pepXML document contains the attribute text the
// writer is expected to emit.
//
// str: the pepXML document as a single string; every check below is a plain
//      substring search (bal::contains) for one literal attribute, so the
//      document is not parsed here.
//
// Each expectation is its own unit_assert, so a failure points at one specific
// missing attribute.
159void testTranslation(const string& str)
160{
161 // test that search engine name is written using preferred name
162 unit_assert(bal::contains(str, "search_engine=\"Mascot\""));
163
164 // test that score names are written using preferred name
165 unit_assert(bal::contains(str, "name=\"ionscore\""));
166 unit_assert(bal::contains(str, "name=\"homologyscore\""));
167 unit_assert(bal::contains(str, "name=\"identityscore\""));
168 unit_assert(bal::contains(str, "name=\"expect\""));
169 unit_assert(bal::contains(str, "name=\"an extra score\""));
170
171 // test that nativeID is preserved
172 unit_assert(bal::contains(str, "spectrumNativeID=\"controllerType=0 controllerNumber=1 scan=420\""));
173}
174
176{
177 if (os_) *os_ << "begin testSerialize" << endl;
178
179 Serializer_pepXML serializer(config);
180 ostringstream oss;
181 serializer.write(oss, mzid, "tiny.pepXML");
182
183 if (os_) *os_ << "oss:\n" << oss.str() << endl;
184 if (config.readSpectrumQueries)
185 testTranslation(oss.str());
186
187 shared_ptr<istringstream> iss(new istringstream(oss.str()));
188 IdentData mzid2;
189 serializer.read(iss, mzid2);
190
191 References::resolve(mzid2);
192
193 // remove DecoyPrefix userParam that is redundant with the decoy DB prefix cvParam
194 boost::range::remove_erase_if(mzid2.analysisProtocolCollection.spectrumIdentificationProtocol[0]->additionalSearchParams.userParams, UserParamNameIs("DecoyPrefix"));
195
197 if (os_ && diff) *os_ << diff << endl;
199}
200
202{
203 IdentData mzid;
207
208
209 // test non-specific enzyme
210 mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
211 EnzymePtr noEnzyme(new Enzyme);
212 noEnzyme->id = "ENZ_1";
213 noEnzyme->cTermGain = "OH";
214 noEnzyme->nTermGain = "H";
215 noEnzyme->missedCleavages = 2;
216 noEnzyme->minDistance = 1;
217 noEnzyme->terminalSpecificity = proteome::Digestion::NonSpecific;
218 noEnzyme->siteRegexp = "(?<=[KR])";
219 noEnzyme->enzymeName.set(MS_Trypsin_P);
220 mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(noEnzyme);
222
223
224 // test sense="N" enzymes
225 mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
226 EnzymePtr aspN(new Enzyme);
227 aspN->id = "ENZ_1";
228 aspN->cTermGain = "OH";
229 aspN->nTermGain = "H";
230 aspN->missedCleavages = 2;
231 aspN->minDistance = 1;
232 aspN->terminalSpecificity = proteome::Digestion::FullySpecific;
233 aspN->siteRegexp = "(?=[BD])";
234 aspN->enzymeName.set(MS_Asp_N);
235 mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(aspN);
237
238 aspN->missedCleavages = 4;
239 aspN->minDistance = 2;
240 aspN->terminalSpecificity = proteome::Digestion::SemiSpecific;
241 aspN->siteRegexp = "(?=[BND])";
242 aspN->enzymeName.clear();
243 aspN->enzymeName.userParams.push_back(UserParam("custom"));
245
246
247 // test with readSpectrumQueries == false
248
249 // clear the original SequenceCollection
250 mzid.sequenceCollection.dbSequences.clear();
251 mzid.sequenceCollection.peptides.clear();
253
254 // clear the original analysis data
255 mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->spectrumIDFormat = CVParam();
256 mzid.analysisCollection.spectrumIdentification[0]->spectrumIdentificationListPtr.reset();
259
261}
262
264{
265 PepXMLSpecificity result;
266 Enzyme ez;
267
269 result = pepXMLSpecificity(ez);
271 unit_assert_operator_equal("KR", result.cut);
273
274 ez.enzymeName.clear();
276 result = pepXMLSpecificity(ez);
278 unit_assert_operator_equal("KR", result.cut);
280
281 ez.enzymeName.clear();
282 ez.enzymeName.userParams.push_back(UserParam("trypsin/p"));
283 result = pepXMLSpecificity(ez);
285 unit_assert_operator_equal("KR", result.cut);
287
288 ez.enzymeName.clear();
289 ez.name = "trypsin/p";
290 result = pepXMLSpecificity(ez);
292 unit_assert_operator_equal("KR", result.cut);
294
295 ez.name.clear();
297 result = pepXMLSpecificity(ez);
299 unit_assert_operator_equal("BD", result.cut);
301
302 ez.enzymeName.clear();
303 ez.siteRegexp = proteome::Digestion::getCleavageAgentRegex(MS_Trypsin);
304 result = pepXMLSpecificity(ez);
306 unit_assert_operator_equal("KR", result.cut);
308
309 ez.siteRegexp = proteome::Digestion::getCleavageAgentRegex(MS_Trypsin_P);
310 result = pepXMLSpecificity(ez);
312 unit_assert_operator_equal("KR", result.cut);
314
315 ez.siteRegexp = proteome::Digestion::getCleavageAgentRegex(MS_Asp_N);
316 result = pepXMLSpecificity(ez);
318 unit_assert_operator_equal("BD", result.cut);
320
321
322 // REMEMBER: update the pepXMLSpecificity function when new CV enzymes are added
323 bool allCleavageAgentsHandled = true;
324 ez.siteRegexp.clear();
325 BOOST_FOREACH(CVID cleavageAgent, proteome::Digestion::getCleavageAgents())
326 try
327 {
328 ez.enzymeName.clear();
330 result = pepXMLSpecificity(ez);
331 }
332 catch (exception& e)
333 {
334 cerr << e.what() << endl;
335 allCleavageAgentsHandled = false;
336 }
337 unit_assert(allCleavageAgentsHandled);
338
339
340 ez.siteRegexp = "(?<=[QWERTY])(?=[QWERTY])";
341 result = pepXMLSpecificity(ez);
343 unit_assert_operator_equal("QWERTY", result.cut);
344 unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.no_cut);
345
346 ez.siteRegexp = "(?<![QWERTY])(?![QWERTY])";
347 result = pepXMLSpecificity(ez);
349 unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
350 unit_assert_operator_equal("QWERTY", result.no_cut);
351
352 ez.siteRegexp = "(?<=[QWERTY])";
353 result = pepXMLSpecificity(ez);
355 unit_assert_operator_equal("QWERTY", result.cut);
357
358 ez.siteRegexp = "(?=[QWERTY])";
359 result = pepXMLSpecificity(ez);
361 unit_assert_operator_equal("QWERTY", result.cut);
363
364 ez.siteRegexp = "(?<![QWERTY])";
365 result = pepXMLSpecificity(ez);
367 unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
369
370 ez.siteRegexp = "(?![QWERTY])";
371 result = pepXMLSpecificity(ez);
373 unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
375}
376
377
379{
380 unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123.2"));
381 unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123.12"));
382 unit_assert_operator_equal("basename.2.2", stripChargeFromConventionalSpectrumId("basename.2.2.2"));
383 unit_assert_operator_equal("basename.ext.3.3", stripChargeFromConventionalSpectrumId("basename.ext.3.3.3"));
384 unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123"));
385 unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123"));
386 unit_assert_operator_equal("locus:1.1.1.123", stripChargeFromConventionalSpectrumId("locus:1.1.1.123.2"));
389}
390
391
393{
397
401
404
409
410
413
416
419
422
423
424 unit_assert_operator_equal(MS_Thermo_nativeID_format, nativeIdStringToCVID("controllerType=1 controllerNumber=0 scan=1234"));
425 unit_assert_operator_equal(MS_WIFF_nativeID_format, nativeIdStringToCVID("sample=1 period=1 cycle=1234 experiment=2"));
426}
427
428
// Test driver entry point.
//
// argc/argv: command-line arguments; if the first argument is exactly "-v",
//            the file-level os_ pointer is set to &cout, which the tests check
//            before writing their verbose output.
//
// Exceptions leaving the try block are passed to TEST_FAILED: std exceptions
// with their what() text, anything else with a fixed message.
//
// NOTE(review): this listing shows no test invocations inside the try block and
// no statement after the catch handlers (the embedded numbering skips 436-439
// and 450) -- confirm against the original source before relying on this view.
429int main(int argc, char** argv)
430{
431 TEST_PROLOG(argc, argv)
432
433 try
434 {
// "-v" as the first argument turns on verbose output by pointing os_ at cout
435 if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
440 }
441 catch (exception& e)
442 {
// report the exception's own message
443 TEST_FAILED(e.what())
444 }
445 catch (...)
446 {
// non-std exceptions carry no message, so report a fixed one
447 TEST_FAILED("Caught unknown exception.")
448 }
449
451}
#define BFS_STRING(p)
void diff(const string &filename1, const string &filename2)
KernelTraitsBase< Kernel >::space_type::abscissa_type x
void testPepXMLSpecificity()
void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)
int main(int argc, char **argv)
void stripUnmappedMetadata(IdentData &mzid)
void testStripChargeFromConventionalSpectrumId()
void testSerialize()
void testTranslation()
ostream * os_
MZIDData <-> pepXML stream serialization.
void read(boost::shared_ptr< std::istream > is, IdentData &mzid, const pwiz::util::IterationListenerRegistry *=0) const
read in MZIDData object from a pepXML istream
void write(std::ostream &os, const IdentData &mzid, const std::string &filepath, const pwiz::util::IterationListenerRegistry *=0) const
write MZIDData object to ostream as pepXML
MS_X_Tandem
X!Tandem: X!Tandem was used to analyze the spectra.
Definition cv.hpp:4833
MS_Trypsin
Trypsin: Enzyme trypsin.
Definition cv.hpp:4179
MS_Trypsin_P
Trypsin/P: Cleavage agent Trypsin/P.
Definition cv.hpp:4350
MS_Thermo_nativeID_format
Thermo nativeID format: Native format defined by controllerType=xsd:nonNegativeInteger controllerNumb...
Definition cv.hpp:2976
MS_MyriMatch
MyriMatch: Tabb Lab software for directly comparing peptides in a database to tandem mass spectra.
Definition cv.hpp:5157
MS_WIFF_nativeID_format
WIFF nativeID format: Native format defined by sample=xsd:nonNegativeInteger period=xsd:nonNegativeIn...
Definition cv.hpp:2982
MS_Comet
Comet: Comet open-source sequence search engine developed at the University of Washington.
Definition cv.hpp:7200
CVID_Unknown
Definition cv.hpp:114
MS_Asp_N
Asp-N: Endoproteinase Asp-N.
Definition cv.hpp:4320
MS_SEQUEST
SEQUEST: The name of the SEQUEST search engine.
Definition cv.hpp:4053
MS_SEQUEST_xcorr
SEQUEST:xcorr: The SEQUEST result 'XCorr'.
Definition cv.hpp:3924
MS_MyriMatch_MVH
MyriMatch:MVH: Using the multivariate hypergeometric distribution and a peak list divided into severa...
Definition cv.hpp:5169
MS_Comet_xcorr
Comet:xcorr: The Comet result 'XCorr'.
Definition cv.hpp:7203
PWIZ_API_DECL void resolve(ContactRole &cr, IdentData &mzid)
PWIZ_API_DECL void initializeBasicSpectrumIdentification(IdentData &mzid)
PWIZ_API_DECL PepXMLSpecificity pepXMLSpecificity(const Enzyme &ez)
converts an identdata::Enzyme into a pepXML cut/no_cut/sense tuple
PWIZ_API_DECL CVID nativeIdStringToCVID(const std::string &id)
attempts to convert a period-delimited id into a nativeID format (e.g. "1.0.123" appears to be a Ther...
PWIZ_API_DECL const std::string & softwareCVIDToPepXMLSoftwareName(CVID softwareCVID)
converts a software CVID to the preferred name for that software in pepXML; an unrecognized software ...
PWIZ_API_DECL proteome::Peptide peptide(const Peptide &peptide)
creates a proteome::Peptide from an identdata::Peptide
PWIZ_API_DECL std::string stripChargeFromConventionalSpectrumId(const std::string &id)
strips charge state from known conventions of the pepXML spectrum attribute; used to find a unique id...
PWIZ_API_DECL CVID pepXMLScoreNameToCVID(CVID softwareCVID, const std::string &scoreName)
for a given software CVID, converts a pepXML score name into its corresponding CVID,...
PWIZ_API_DECL const std::string & scoreCVIDToPepXMLScoreName(CVID softwareCVID, CVID scoreCVID)
for a given software CVID, converts a score CVID into the preferred name for that score in pepXML; an...
PWIZ_API_DECL CVID cleavageAgent(const Enzyme &ez)
returns a cleavage agent CVID for an identdata::Enzyme
PWIZ_API_DECL CVID pepXMLSoftwareNameToCVID(const std::string &softwareName)
converts a software name stored in pepXML software element into its corresponding CVID,...
int operator()(const EnzymePtr &x) const
int operator()(const EnzymePtr &x) const
UserParamNameIs(const string &name)
bool operator()(const UserParam &up) const
represents a tag-value pair, where the tag comes from the controlled vocabulary
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition diff_std.hpp:143
void set(CVID cvid, const std::string &value="", CVID units=CVID_Unknown)
set/add a CVParam (not recursive)
void clear()
clears the collections
std::vector< UserParam > userParams
a collection of uncontrolled user terms
Uncontrolled user parameters (essentially allowing free text). Before using these,...
std::string name
the name for the parameter.
std::vector< SpectrumIdentificationPtr > spectrumIdentification
std::vector< SpectrumIdentificationListPtr > spectrumIdentificationList
ProteinDetectionListPtr proteinDetectionListPtr
std::vector< SpectrumIdentificationProtocolPtr > spectrumIdentificationProtocol
Implementation of EnzymeType from the mzIdentML schema.
ParamContainer enzymeName
std::vector< EnzymePtr > enzymes
Implementation of the MzIdentMLType from the mzIdentML schema.
SequenceCollection sequenceCollection
DataCollection dataCollection
AnalysisProtocolCollection analysisProtocolCollection
std::vector< AnalysisSoftwarePtr > analysisSoftwareList
std::vector< ContactPtr > auditCollection
std::vector< BibliographicReferencePtr > bibliographicReference
AnalysisSampleCollection analysisSampleCollection
AnalysisCollection analysisCollection
std::vector< SourceFilePtr > sourceFile
Implementation of PeptideEvidenceType from the mzIdentML schema.
Implementation of ProteinDetectionType from the mzIdentML schema.
Implementation of ProviderType from the mzIdentML schema.
std::vector< DBSequencePtr > dbSequences
std::vector< PeptideEvidencePtr > peptideEvidence
std::vector< PeptidePtr > peptides
Implementation of SpectrumIdentificationProtocolType from the mzIdentML schema.
#define unit_assert(x)
Definition unit.hpp:85
#define TEST_EPILOG
Definition unit.hpp:183
#define TEST_FAILED(x)
Definition unit.hpp:177
#define unit_assert_operator_equal(expected, actual)
Definition unit.hpp:92
#define TEST_PROLOG(argc, argv)
Definition unit.hpp:175