19 #include "FortranFormat.h"
26 void Pedigree::Prepare(
IFILE & input)
31 void Pedigree::Load(
IFILE & input)
39 int sexCovariate = sexAsCovariate ? GetCovariateID(
"sex") : -1;
41 int textCols = pd.CountTextColumns() + 5;
53 buffer.ReadLine(input);
56 tokens.AddTokens(buffer, WHITESPACE);
58 if (tokens.Length() == 0)
continue;
59 if (tokens[0].SlowCompare(
"end") == 0)
break;
63 if (tokens.Length() < textCols)
65 if (buffer.Length() > 79)
73 pd.ColumnSummary(description);
74 error(
"Loading Pedigree...\n\n"
75 "Expecting %d columns (%s),\n"
76 "but read only %d columns in line %d.\n\n"
77 "The problem line is transcribed below:\n%s\n",
78 textCols, (
const char *) description,
79 tokens.Length(), line, (
const char *) buffer);
82 if (tokens.Length() > textCols && warn && textCols > 5)
84 pd.ColumnSummary(buffer);
85 printf(
"WARNING -- Trailing columns in pedigree file will be ignored\n"
86 " Expecting %d data columns (%s)\n"
87 " However line %d, for example, has %d data columns\n\n",
88 textCols - 5, (
const char *) buffer, line, tokens.Length() - 5);
95 if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
97 if (count == size) Grow();
99 p = persons[count++] =
new Person;
102 p->famid = tokens[field++];
103 p->pid = tokens[field++];
104 p->fatid = tokens[field++];
105 p->motid = tokens[field++];
107 bool failure =
false;
108 p->sex = TranslateSexCode(tokens[field++], failure);
110 error(
"Can't interpret the sex of individual #%d\n"
111 "Family: %s Individual: %s Sex Code: %s", count,
112 (
const char *) p->famid, (
const char *) p->pid,
113 (
const char *) tokens[field-1]);
118 p->covariates[sexCovariate] = p->sex;
120 p->covariates[sexCovariate] = _NAN_;
123 for (
int col = 0; col < pd.columnCount; col++)
124 switch (pd.columns[col])
128 int a = pd.columnHash[col];
131 const char * affection = tokens[field++];
133 switch (toupper(affection[0]))
147 new_status = atoi(affection);
148 if (new_status < 0 || new_status > 2)
149 error(
"Incorrect formating for affection status "
150 "Col %d, Affection %s\n"
151 "Family: %s Individual: %s Status: %s",
152 col, (
const char *) affectionNames[a],
153 (
const char *) p->famid, (
const char *) p->pid,
156 if (new_status != 0 && p->affections[a] != 0 &&
157 new_status != p->affections[a])
158 error(
"Conflict with previous affection status - "
159 "Col %d, Affection %s\n"
160 "Family: %s Individual: %s Old: %d New: %d",
161 col, (
const char *) affectionNames[a],
162 (
const char *) p->famid, (
const char *) p->pid,
163 p->affections[a], new_status);
164 if (new_status) p->affections[a] = new_status;
169 int m = pd.columnHash[col];
173 new_genotype[0] = LoadAllele(m, tokens[field++]);
174 new_genotype[1] = LoadAllele(m, tokens[field++]);
176 if (p->markers[m].isKnown() && new_genotype.isKnown() &&
177 new_genotype != p->markers[m])
181 error(
"Conflict with previous genotype - Col %d, Marker %s\n"
182 "Family: %s Individual: %s Old: %s/%s New: %s/%s",
183 col, (
const char *) markerNames[m],
184 (
const char *) p->famid, (
const char *) p->pid,
185 (
const char *) info->GetAlleleLabel(p->markers[m][0]),
186 (
const char *) info->GetAlleleLabel(p->markers[m][1]),
187 (
const char *) info->GetAlleleLabel(new_genotype[0]),
188 (
const char *) info->GetAlleleLabel(new_genotype[1]));
191 if (new_genotype.isKnown()) p->markers[m] = new_genotype;
195 case pcUndocumentedTraitCovariate :
197 int t = pd.columnHash[col];
198 double new_pheno = _NAN_;
200 if (pd.columns[col] == pcUndocumentedTraitCovariate)
203 const char * value = tokens[field++];
206 if (missing == (
const char *) NULL || strcmp(value, missing) != 0)
207 new_pheno = strtod(value, &flag);
208 if (flag != NULL && *flag) new_pheno = _NAN_;
210 if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
211 new_pheno != p->traits[t])
212 error(
"Conflict with previous phenotype - Col %d, Trait %s\n"
213 "Family: %s Individual: %s Old: %f New: %f",
214 col, (
const char *) traitNames[t],
215 (
const char *) p->famid, (
const char *) p->pid,
216 p->traits[t], new_pheno);
218 if (new_pheno != _NAN_) p->traits[t] = new_pheno;
219 if (pd.columns[col] == pcTrait)
break;
223 int c = pd.columnHash[col];
224 double new_covar = _NAN_;
226 if (pd.columns[col] == pcUndocumentedTraitCovariate)
232 const char * value = tokens[field++];
235 if (missing == (
const char *) NULL || strcmp(value, missing) != 0)
236 new_covar = strtod(value, &flag);
237 if (flag != NULL && *flag) new_covar = _NAN_;
239 if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
240 new_covar != p->covariates[c])
241 error(
"Conflict with previous value - Col %d, Covariate %s\n"
242 "Family: %s Individual: %s Old: %f New: %f",
243 col, (
const char *) covariateNames[c],
244 (
const char *) p->famid, (
const char *) p->pid,
245 p->covariates[c], new_covar);
247 if (new_covar != _NAN_) p->covariates[c] = new_covar;
252 int c = pd.columnHash[col];
254 if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field])
255 error(
"Conflict with previous value - Col %d, String %s\n"
256 "Family: %s Individual: %s Old: %s New: %s",
257 col, (
const char *) stringNames[c],
258 (
const char *) p->famid, (
const char *) p->pid,
259 (
const char *) p->strings[c], (
const char *) tokens[field]);
261 p->strings[c] = tokens[field++];
272 const char * zygosity = tokens[field++];
285 new_zygosity = atoi(zygosity);
287 if (p->zygosity != 0 && new_zygosity != p->zygosity)
288 error(
"Conflict with previous zygosity - "
289 "Column %d in pedigree\n"
290 "Family: %s Individual: %s Old: %d New: %d\n",
291 col, (
const char *) p->famid, (
const char *) p->pid,
292 p->zygosity, new_zygosity);
293 p->zygosity = new_zygosity;
299 error(
"Inconsistent Pedigree Description -- Internal Error");
306 void Pedigree::LoadMendel(
IFILE & input)
312 familyHeader.ReadLine(input);
313 individualRecord.ReadLine(input);
320 headers.SetInputFile(input);
321 headers.SetFormat(familyHeader);
323 records.SetInputFile(input);
324 records.SetFormat(individualRecord);
335 int sexCovariate = sexAsCovariate ? GetCovariateID(
"sex") : -1;
337 while (!
ifeof(input))
343 familySize = headers.GetNextInteger();
344 headers.GetNextField(famid);
349 if (
ifeof(input) && familySize == 0)
352 error(
"Blank family id encountered\n");
356 for (
int i = 0; i < familySize; i++)
362 records.GetNextField(p->pid);
363 records.GetNextField(p->fatid);
364 records.GetNextField(p->motid);
366 if (p->pid.IsEmpty())
367 error(
"No unique identifier for individual #%d in family %s\n",
368 i + 1, (
const char *) famid);
370 if (p->pid.Compare(
".") == 0)
371 error(
"Family %s has an individual named '.', but this code is\n"
372 "reserved to indicate missing parents\n");
374 if (p->fatid.IsEmpty()) p->fatid =
".";
375 if (p->motid.IsEmpty()) p->motid =
".";
378 char sex = records.GetNextCharacter();
400 error(
"Can't interpret the sex of individual #%d\n"
401 "Family: %s Individual: %s Sex Code: %s", count,
402 (
const char *) p->famid, (
const char *) p->pid, sex);
408 p->covariates[sexCovariate] = p->sex;
410 p->covariates[sexCovariate] = _NAN_;
414 char zygosity = records.GetNextCharacter();
419 p->zygosity = (zygosity -
' ') * 2 - 1;
421 affectionStem.Clear();
422 for (
int col = 0; col < pd.columnCount; col++)
423 switch (pd.columns[col])
427 int a = pd.columnHash[col];
433 if (affectionStem.Length() == 0 ||
434 affectionNames[a].CompareToStem(affectionStem) != 0)
436 affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar(
'>') + 1);
437 records.GetNextField(phenotype);
438 affectionCode = affectionStem + phenotype;
442 if (phenotype.IsEmpty())
443 p->affections[a] = 0;
445 p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
451 int m = pd.columnHash[col];
453 records.GetNextField(phenotype);
455 if (phenotype.IsEmpty())
457 p->markers[m].one = p->markers[m].two = 0;
461 int separator = phenotype.FindChar(
'/');
462 if (separator == -1) separator = phenotype.FindChar(
'|');
465 error(
"At marker %s, person %s in family %s has genotype %s.\n"
466 "This genotype is not in the 'al1/al2' format.\n",
467 (
const char *) markerNames[m],
468 (
const char *) p->pid,
469 (
const char *) p->famid,
470 (
const char *) phenotype);
472 allele1.Copy(phenotype, 0, separator);
475 allele2.Copy(phenotype, separator + 1, 8);
480 int one = info->alleleNumbers.Integer(allele1);
484 if (info->freq.Length() == 0)
485 one = info->NewAllele(allele1);
487 error(
"At marker %s, person %s in family %s has genotype %s.\n"
488 "However, '%s' is not a valid allele for this marker.\n",
489 (
const char *) markerNames[m],
490 (
const char *) p->pid,
491 (
const char *) p->famid,
492 (
const char *) phenotype,
493 (
const char *) allele1);
496 int two = info->alleleNumbers.Integer(allele2);
500 if (info->freq.Length() == 0)
501 two = info->NewAllele(allele2);
503 error(
"At marker %s, person %s in family %s has genotype %s.\n"
504 "However, '%s' is not a valid allele for this marker.\n",
505 (
const char *) markerNames[m],
506 (
const char *) p->pid,
507 (
const char *) p->famid,
508 (
const char *) phenotype,
509 (
const char *) allele2);
512 p->markers[m].one = one;
513 p->markers[m].two = two;
523 error(
"Inconsistent Pedigree Description -- Internal Error");
533 void Pedigree::Prepare(
const char * filename)
544 filenames.AddColumns(filename,
',');
546 if (filenames.Length() <= 1)
550 printf(
"AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
554 for (
int i = 0; i < filenames.Length(); i++)
556 printf(
" AUTOMATIC MERGE: Reading data file '%s' ...\n", (
const char *) filenames[i]);
557 multiPd[i].Load(filenames[i],
false);
560 multiFileCount = filenames.Length();
564 void Pedigree::Load(
const char * filename,
bool allowFailures)
566 if (multiFileCount <= 1)
570 if (f == NULL && allowFailures)
575 "The pedigree file %s cannot be opened\n\n"
576 "Common causes for this problem are:\n"
577 " * You might not have used the correct options to specify input file names,\n"
578 " please check the program documentation for information on how to do this\n\n"
579 " * The file doesn't exist or the filename might have been misspelt\n\n"
580 " * The file exists but it is being used by another program which you will need\n"
582 " * The file is larger than 2GB and you haven't compiled this application with\n"
583 " large file support.\n\n",
593 filenames.AddColumns(filename,
',');
595 if (filenames.Length() != multiFileCount)
596 error(
"Different numbers of comma separated data and pedigree file names provided\n");
598 for (
int i = 0; i < filenames.Length(); i++)
600 printf(
" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
601 (
const char *) multiPd[i].filename, (
const char *) filenames[i]);
608 error(
"The pedigree file '%s' cannot be opened\n\n",
609 (
const char *) filenames[i]);
619 int Pedigree::TranslateSexCode(
const char * code,
bool & failure)
639 int result = atoi(code);
641 if (result != 0 && result != 1 && result != 2)