001package com.hfg.bio.seq.format; 002 003import java.io.BufferedReader; 004import java.io.BufferedWriter; 005import java.io.IOException; 006import java.io.OutputStream; 007import java.io.OutputStreamWriter; 008import java.io.Reader; 009import java.io.StringWriter; 010import java.io.Writer; 011import java.text.ParseException; 012import java.text.SimpleDateFormat; 013import java.util.ArrayList; 014import java.util.Collection; 015import java.util.Collections; 016import java.util.List; 017import java.util.Map; 018import java.util.Set; 019import java.util.logging.Level; 020import java.util.logging.Logger; 021import java.util.regex.Matcher; 022import java.util.regex.Pattern; 023 024import com.hfg.bio.DbXref; 025import com.hfg.bio.seq.BioSequencePlus; 026import com.hfg.bio.seq.BioSequenceType; 027import com.hfg.bio.seq.Clone; 028import com.hfg.bio.seq.SeqLocation; 029import com.hfg.bio.seq.SeqTopology; 030import com.hfg.bio.seq.format.feature.FeatureQualifier; 031import com.hfg.bio.seq.format.feature.SeqFeature; 032import com.hfg.bio.seq.BioSequence; 033import com.hfg.bio.seq.BioSequenceFactory; 034import com.hfg.bio.seq.format.feature.genbank.*; 035import com.hfg.bio.seq.format.feature.qualifier.MolType; 036import com.hfg.bio.seq.format.genbank.GenBankKeyword; 037import com.hfg.bio.seq.format.genbank.GenBankSubkeyword; 038import com.hfg.bio.seq.format.genbank.InvalidGenBankKeywordException; 039import com.hfg.bio.seq.format.genbank.InvalidGenBankSubkeywordException; 040import com.hfg.bio.taxonomy.ncbi.NCBIGenBankDivision; 041import com.hfg.bio.taxonomy.ncbi.NCBITaxon; 042import com.hfg.citation.Author; 043import com.hfg.citation.CitationType; 044import com.hfg.citation.Journal; 045import com.hfg.citation.PatentData; 046import com.hfg.datetime.DateUtil; 047import com.hfg.util.StringBuilderPlus; 048import com.hfg.util.StringUtil; 049import com.hfg.util.collection.CollectionUtil; 050import com.hfg.util.collection.OrderedMap; 051import com.hfg.util.io.LettersOnlyReader; 052 053//------------------------------------------------------------------------------ 054/** 055 GenBank sequence format. 056 <p> 057 See <a href='ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt'>ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt</a> 058 </p> 059 <p> 060 See <a href='http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html'>http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html</a> 061 </p> 062 @author J. Alex Taylor, hairyfatguy.com 063 */ 064//------------------------------------------------------------------------------ 065// com.hfg Library 066// 067// This library is free software; you can redistribute it and/or 068// modify it under the terms of the GNU Lesser General Public 069// License as published by the Free Software Foundation; either 070// version 2.1 of the License, or (at your option) any later version. 071// 072// This library is distributed in the hope that it will be useful, 073// but WITHOUT ANY WARRANTY; without even the implied warranty of 074// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 075// Lesser General Public License for more details. 076// 077// You should have received a copy of the GNU Lesser General Public 078// License along with this library; if not, write to the Free Software 079// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 080// 081// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 082// jataylor@hairyfatguy.com 083//------------------------------------------------------------------------------ 084 085public class GenBank<T extends BioSequence> extends ReadableSeqFormatBase<T> implements WritableSeqFormat<T> 086{ 087 088 // Variables used during parsing 089 private T mCurrentSeq; 090 private GenBankKeyword mCurrentKeyword; 091 private GenBankSubkeyword mCurrentSubkeyword; 092 private GenBankFeature mCurrentFeature; 093 private GenBankFeatureQualifier mCurrentFeatureQualifier; 094 private SeqCitation mCurrentReference; 095 private Integer mSeqLengthFromLocusLine; 096 097 private SimpleDateFormat mDateFormat = new SimpleDateFormat("dd-MMM-yyyy"); 098 099 100 private int mMaxExceptionsPerRecord = 0; 101 102// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)\\s+(?:[\\-\\w]+)?(?:\\s+(\\w+))?\\s+(\\w{3})\\s+(\\S{11})"); 103// private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?"); 104 private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s+(\\S+)?\\s+(?:\\S+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA|cRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?"); 105 private static final Pattern sFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?"); 106 private static final Pattern sReferenceLocationPattern = Pattern.compile("\\(bases (\\d+) to (\\d+)\\)"); 107 private static final Pattern sReferencePatentPattern = Pattern.compile("Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?"); 108 private static final Pattern sReferenceDirectSubmissionPattern = Pattern.compile("Submitted \\((\\d{2}-\\w{3}-(\\d{4}))\\)\\s+(.*)"); 109 110 private static final Pattern sPatentLocationPattern = Pattern.compile(".+, \\w{2}"); 111 private static final Pattern sPatentParensLocationPattern = Pattern.compile(".+ \\(\\w{2}\\)"); 112 113 // Examples: 114 // Thesis 115 // Thesis (1996) Utrecht University, The Netherlands 116 private static final Pattern sReferenceThesisPattern = Pattern.compile("Thesis(?: \\((\\d{4})\\)\\s+(.*))?"); 117 118 // Examples: 119 // Proc. Natl. Acad. Sci. U.S.A. 82 (3), 844-848 (1985) 120 // Front Immunol 9, 1079 (2018) 121 // Nat Commun (2018) In press 122 // Dev. Comp. Immunol. 25 (5-6), 387-401 123 // J. Exp. Zool. 295B (1), 45-58 (2003) 124 // PLoS ONE 8 (8), E70650 (2013) 125 // Mol. Phylogenet. Evol. 94 (Pt B), 577-590 (2016) 126 private static final Pattern sReferenceJournalPattern = Pattern.compile("(.+?)(?:\\s+(\\S+)(?:\\s+\\(([^\\)]+)\\))?,\\s+(\\d+(?:\\-\\d+)?|E\\d+))?(?:\\s+\\((\\d{4})\\))?(?: In press)?", Pattern.CASE_INSENSITIVE); 127 128 private static final SimpleDateFormat sDateFormat = new SimpleDateFormat("dd-MMM-yyyy"); 129 130 public static final String COMMENT_ATTR = "Comment"; 131 public static final String NCBI_GI_ATTR = "NCBI GI"; 132 public static final String CONTIG_ATTR = "Contig"; 133 134 private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName()); 135 136 static 137 { 138 LOGGER.setLevel(Level.WARNING); 139 LOGGER.setUseParentHandlers(true); 140 } 141 142 //########################################################################### 143 // CONSTRUCTORS 144 //########################################################################### 145 146 //--------------------------------------------------------------------------- 147 public GenBank(BioSequenceFactory<T> inSeqFactory) 148 { 149 super(inSeqFactory); 150 } 151 152 //########################################################################### 153 // PUBLIC METHODS 154 //########################################################################### 155 156 //--------------------------------------------------------------------------- 157 public static Logger getLogger() 158 { 159 return LOGGER; 160 } 161 162 //--------------------------------------------------------------------------- 163 public boolean isEndOfRecord(String inLine) 164 { 165 // Trying for something slightly more efficient than inLine.trim().equals("//") 166 return inLine.startsWith("//") && 2 == inLine.trim().length(); 167 } 168 169 //--------------------------------------------------------------------------- 170 public boolean hasJanusDelimiter() 171 { 172 return false; 173 } 174 175 //--------------------------------------------------------------------------- 176 /** 177 Specify the maximum number of Exceptions to tolerate per record. Defaults to zero. 178 This mechanism will only work with sequences objects that implement the BioSequencePlus interface. 179 If a record produces less than the specified maximum number of Exceptions, the 180 Exceptions can be retrieved via the getParseExceptions() method on the 181 BioSequencePlus sequence object. 182 * @param inValue the maximum number of Exceptions to tolerate per record 183 * @return this format object to facilitate method chaining. 184 */ 185 public GenBank<T> setMaxExceptionsPerRecord(int inValue) 186 { 187 mMaxExceptionsPerRecord = inValue; 188 return this; 189 } 190 191 //--------------------------------------------------------------------------- 192 public T readRecord(BufferedReader inReader) 193 throws SeqIOException 194 { 195 initRecordParsing(); 196 197 int lineCount = 0; 198 int maxPreLocusLines = 50; 199 boolean locusLineFound = false; 200 boolean originLineFound = false; 201 202 mCurrentSeq = getBioSequenceFactory().createSeqObj(); 203 204 try 205 { 206 String line; 207 while ((line = inReader.readLine()) != null) 208 { 209 lineCount++; 210 try 211 { 212 if (!locusLineFound) 213 { 214 if (lineCount > maxPreLocusLines) 215 { 216 throw new SeqFormatException("No GenBank " + GenBankKeyword.LOCUS 217 + " line found within " + maxPreLocusLines 218 + " lines of the start!"); 219 } 220 221 try 222 { 223 GenBankKeyword keyword = getLineKeyword(line); 224 if (GenBankKeyword.LOCUS.equals(keyword)) 225 { 226 locusLineFound = true; 227 } 228 else 229 { 230 continue; 231 } 232 } 233 catch (InvalidGenBankKeywordException e) 234 { 235 // Ignore 236 continue; 237 } 238 } 239 else if (isEndOfRecord(line)) 240 { 241 break; 242 } 243 244 if (locusLineFound 245 && StringUtil.isSet(line)) 246 { 247 parseLine(line); 248 249 if (GenBankKeyword.ORIGIN.equals(mCurrentKeyword)) 250 { 251 originLineFound = true; 252 break; 253 } 254 } 255 } 256 catch(Exception e) 257 { 258 SeqIOException seqIOException = new SeqIOException("Problem parsing " 259 + (StringUtil.isSet(mCurrentSeq.getID()) ? mCurrentSeq.getID() + " " : "") 260 + "record line " + lineCount + " : " + StringUtil.singleQuote(line), e); 261 262 if (mMaxExceptionsPerRecord > 0 263 && mCurrentSeq instanceof BioSequencePlus 264 && (! ((BioSequencePlus) mCurrentSeq).hadParseExceptions() 265 || ((BioSequencePlus) mCurrentSeq).getParseExceptions().size() < mMaxExceptionsPerRecord)) 266 { 267 ((BioSequencePlus) mCurrentSeq).addParseException(seqIOException); 268 GenBank.getLogger().warning(e.getMessage()); 269 } 270 else 271 { 272 throw seqIOException; 273 } 274 } 275 } 276 277 if (! locusLineFound) 278 { 279 throw new SeqFormatException("No GenBank LOCUS line found!"); 280 } 281 282 if (originLineFound) 283 { 284 // The rest of the record is assumed to be sequence 285 286 // Cleanup the sequence to remove spaces and numbers 287 // Reader filterReader = new GenBankSeqFilterReader(inReader); 288 // mCurrentSeq.setSequence(filterReader); 289 290 // filterReader.close(); 291 mCurrentSeq.setSequence(inReader); 292 } 293 294 inReader.close(); 295 } 296 catch (Exception e) 297 { 298 throw new SeqIOException("Problem parsing GenBank record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e); 299 } 300 301 if (! locusLineFound) 302 { 303 throw new SeqFormatException("No LOCUS line detected in the GenBank record!"); 304 } 305 306 return mCurrentSeq; 307 } 308 309 //--------------------------------------------------------------------------- 310 public String write(Collection<T> inSeqs) 311 throws SeqIOException 312 { 313 StringWriter writer = new StringWriter(); 314 for (T seq : inSeqs) 315 { 316 write(seq, writer); 317 } 318 319 return writer.toString(); 320 } 321 322 //--------------------------------------------------------------------------- 323 public String write(T inSeq) 324 throws SeqIOException 325 { 326 StringWriter writer = new StringWriter(); 327 328 write(inSeq, writer); 329 330 return writer.toString(); 331 } 332 333 //--------------------------------------------------------------------------- 334 public void write(T inSeq, OutputStream inStream) 335 throws SeqIOException 336 { 337 Writer writer = new OutputStreamWriter(inStream); 338 write(inSeq, writer); 339 try 340 { 341 writer.flush(); 342 } 343 catch (Exception e) 344 { 345 throw new SeqIOException(e); 346 } 347 } 348 349 //--------------------------------------------------------------------------- 350 public void write(T inSeq, Writer inWriter) 351 throws SeqIOException 352 { 353 Reader seqReader = null; 354 BufferedWriter writer = null; 355 try 356 { 357 try 358 { 359 if (writer instanceof BufferedWriter) 360 { 361 writer = (BufferedWriter) inWriter; 362 } else 363 { 364 writer = new BufferedWriter(inWriter, 8196); 365 } 366 367 // Write the LOCUS line 368 writeLocus(inSeq, writer); 369 370 // Write the DEFINTION line(s) 371 writeDefinition(inSeq, writer); 372 373 // Write the ACCESSION line 374 writeAccession(inSeq, writer); 375 376 // Write the VERSION line 377 writeVersion(inSeq, writer); 378 379 // TODO: SOURCE 380 381 if (inSeq instanceof BioSequencePlus) 382 { 383 BioSequencePlus seqPlus = (BioSequencePlus) inSeq; 384 385 if (CollectionUtil.hasValues(seqPlus.getDbXrefs())) 386 { 387 writeDBLinks(seqPlus.getDbXrefs(), writer); 388 } 389 390 if (CollectionUtil.hasValues(seqPlus.getReferences())) 391 { 392 writeReferences(seqPlus.getReferences(), writer); 393 } 394 395 // Write features 396 if (CollectionUtil.hasValues(seqPlus.getFeatures())) 397 { 398 writer.write(GenBankKeyword.FEATURES + " Location/Qualifiers\n"); 399 for (SeqFeature seqFeature : seqPlus.getFeatures()) 400 { 401 writeFeature(seqFeature, writer); 402 } 403 } 404 } 405 406 // Write the sequence lines 407 writer.write(GenBankKeyword.ORIGIN + "\n"); 408 409 seqReader = inSeq.getSequenceReader(); 410 411 int bufferSize = 60; 412 char[] buffer = new char[bufferSize]; 413 int residueNum = 1; 414 int numBytesRead; 415 416 while ((numBytesRead = seqReader.read(buffer)) != -1) 417 { 418 if (numBytesRead < bufferSize) 419 { 420 int secondNumBytesRead = seqReader.read(buffer, numBytesRead, buffer.length - numBytesRead); 421 if (secondNumBytesRead != -1) 422 { 423 numBytesRead += secondNumBytesRead; 424 } 425 } 426 427 writer.write(String.format("%9d", residueNum)); 428 for (int i = 0; i < numBytesRead; i += 10) 429 { 430 writer.write(" "); 431 writer.write(buffer, i, i + 10 > numBytesRead ? numBytesRead - i: 10); 432 } 433 434 writer.write("\n"); 435 436 437 residueNum += numBytesRead; 438 } 439 440 // Write end of record line 441 writer.write("//\n"); 442 } 443 finally 444 { 445 if (seqReader != null) 446 { 447 seqReader.close(); 448 } 449 450 if (writer != null) 451 { 452 writer.flush(); 453 } 454 } 455 } 456 catch (SeqIOException e) 457 { 458 throw e; 459 } 460 catch (Exception e) 461 { 462 throw new SeqIOException(e); 463 } 464 } 465 466 //########################################################################### 467 // PRIVATE METHODS 468 //########################################################################### 469 470 //--------------------------------------------------------------------------- 471 private void initRecordParsing() 472 { 473 mCurrentSeq = null; 474 mCurrentKeyword = null; 475 mCurrentSubkeyword = null; 476 mCurrentFeature = null; 477 mCurrentFeatureQualifier = null; 478 mCurrentReference = null; 479 } 480 481 //--------------------------------------------------------------------------- 482 // A keyword starts in column one and has a maximum of 10 characters. 483 private GenBankKeyword getLineKeyword(String inLine) 484 { 485 GenBankKeyword keyword = null; 486 if (Character.isLetter(inLine.charAt(0))) 487 { 488 String keywordString = (inLine.length() > 11 ? inLine.substring(0, 11) : inLine).trim(); 489 if (StringUtil.isSet(keywordString)) 490 { 491 keyword = GenBankKeyword.valueOf(keywordString); 492 if (null == keyword) 493 { 494 throw new InvalidGenBankKeywordException(StringUtil.singleQuote(keywordString) + " is not a recognized GenBank keyword!"); 495 } 496 } 497 } 498 499 return keyword; 500 } 501 502 //--------------------------------------------------------------------------- 503 // A subkeyword starts in column three and has a maximum of 8 characters. 504 private GenBankSubkeyword getLineSubkeyword(String inLine) 505 { 506 GenBankSubkeyword subkeyword = null; 507 if (Character.isWhitespace(inLine.charAt(0)) 508 && Character.isWhitespace(inLine.charAt(1))) 509 { 510 String subkeywordString = (inLine.length() > 11 ? inLine.substring(2, 11) : inLine).trim(); 511 if (StringUtil.isSet(subkeywordString)) 512 { 513 subkeyword = GenBankSubkeyword.valueOf(subkeywordString); 514 if (null == subkeyword) 515 { 516 throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword!"); 517 } 518 else if (! mCurrentKeyword.allowsSubkeyword(subkeyword)) 519 { 520 throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword of " + mCurrentKeyword + "!"); 521 } 522 } 523 } 524 525 return subkeyword; 526 } 527 528 //--------------------------------------------------------------------------- 529 private void finishPreviousKeyword() 530 throws ParseException 531 { 532 if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword)) 533 { 534 finishDefinition(); 535 } 536 else if (GenBankKeyword.REFERENCE.equals(mCurrentKeyword)) 537 { 538 finishReference(); 539 } 540 else if (mCurrentFeatureQualifier != null) 541 { 542 // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES 543 if (mCurrentFeatureQualifier.getValue().startsWith("\"")) 544 { 545 mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue())); 546 } 547 mCurrentFeatureQualifier = null; 548 549 if (mCurrentSeq instanceof BioSequencePlus) 550 { 551 List<SeqFeature> sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source); 552 if (CollectionUtil.hasValues(sourceFeatures)) 553 { 554 SeqFeature source = sourceFeatures.get(0); 555 556 List<FeatureQualifier> molTypeQualifiers = source.getQualifiers(GenBankFeatureQualifierName.mol_type.name()); 557 if (CollectionUtil.hasValues(molTypeQualifiers)) 558 { 559 MolType molType = MolType.valueOf(molTypeQualifiers.get(0).getValue()); 560 if (molType != null) 561 { 562 ((BioSequencePlus) mCurrentSeq).setMolType(molType); 563 } 564 } 565 566 // /db_xref="taxon:9606" 567 List<FeatureQualifier> dbXrefQualifiers = source.getQualifiers(GenBankFeatureQualifierName.db_xref.name()); 568 if (CollectionUtil.hasValues(dbXrefQualifiers)) 569 { 570 for (FeatureQualifier qualifier : dbXrefQualifiers) 571 { 572 String[] pieces = qualifier.getValue().split(":"); 573 if (pieces[0].equals("taxon")) 574 { 575 ((BioSequencePlus) mCurrentSeq).setNCBITaxon(NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1]))); 576 break; 577 } 578 } 579 } 580 581 List<FeatureQualifier> cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name()); 582 if (CollectionUtil.hasValues(cloneQualifiers)) 583 { 584 Clone clone = new Clone(cloneQualifiers.get(0).getValue()); 585 586 List<FeatureQualifier> subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name()); 587 if (CollectionUtil.hasValues(subcloneQualifiers)) 588 { 589 clone.setSubcloneName(subcloneQualifiers.get(0).getValue()); 590 } 591 592 ((BioSequencePlus) mCurrentSeq).setClone(clone); 593 } 594 } 595 } 596 } 597 598 } 599 600 //--------------------------------------------------------------------------- 601 private void parseLine(String inLine) 602 throws Exception 603 { 604 GenBankKeyword keyword = getLineKeyword(inLine); 605 if (keyword != null) 606 { 607 // Found the start of a new keyword field 608 finishPreviousKeyword(); 609 610 mCurrentKeyword = keyword; 611 mCurrentSubkeyword = null; 612 613 parseField(inLine); 614 } 615 else 616 { 617 // Continuation of an existing field 618 if (GenBankKeyword.FEATURES.equals(mCurrentKeyword)) 619 { 620 // Features have a special set of feature keys 621 parseFeatures(inLine); 622 } 623 else 624 { 625 GenBankSubkeyword subkeyword = getLineSubkeyword(inLine); 626 if (subkeyword != null) 627 { 628 // Start of a new subfield 629 mCurrentSubkeyword = subkeyword; 630 } 631 else 632 { 633 // Continuation of an existing subfield 634 } 635 636 parseField(inLine); 637 } 638 } 639 } 640 641 //--------------------------------------------------------------------------- 642 private void parseField(String inLine) 643 throws Exception 644 { 645 if (mCurrentKeyword.equals(GenBankKeyword.LOCUS)) 646 { 647 parseLocus(inLine); 648 } 649 else if (mCurrentKeyword.equals(GenBankKeyword.DEFINITION)) 650 { 651 parseDefinition(inLine); 652 } 653 else if (mCurrentKeyword.equals(GenBankKeyword.VERSION)) 654 { 655 parseVersion(inLine); 656 } 657 else if (mCurrentKeyword.equals(GenBankKeyword.KEYWORDS)) 658 { 659 parseKeywords(inLine); 660 } 661 else if (mCurrentKeyword.equals(GenBankKeyword.SOURCE)) 662 { 663 parseSource(inLine); 664 } 665 else if (mCurrentKeyword.equals(GenBankKeyword.REFERENCE)) 666 { 667 parseReference(inLine); 668 } 669 else if (mCurrentKeyword.equals(GenBankKeyword.COMMENT)) 670 { 671 parseComment(inLine); 672 } 673 else if (mCurrentKeyword.equals(GenBankKeyword.DBLINK)) 674 { 675 parseDBLink(inLine); 676 } 677 else if (mCurrentKeyword.equals(GenBankKeyword.FEATURES)) 678 { 679 parseFeatures(inLine); 680 } 681 else if (mCurrentKeyword.equals(GenBankKeyword.CONTIG)) 682 { 683 parseContig(inLine); 684 } 685 686 // NID is skipped 687 // PROJECT is skipped 688 // SEGMENT is skipped 689 // BASE COUNT is skipped 690 } 691 692 //--------------------------------------------------------------------------- 693 // Parse the LOCUS keyword line 694 // Ex: 695 // LOCUS R88064 460 bp mRNA linear EST 16-AUG-1995 696 // LOCUS pDR000029812 7616 bp circular 697 // LOCUS vDR\365 8070 bp DNA circular 21-MAR-2011 698 // PairwiseSeqAligner aligner = new PairwiseSeqAligner(alignmentSettings); 699 700 // Although it isn't always followed exactly, the detailed format for the LOCUS line format is as follows: 701 // 702 // Positions Contents 703 // --------- -------- 704 // 01-05 'LOCUS' 705 // 06-12 spaces 706 // 13-28 Locus name 707 // 29-29 space 708 // 30-40 Length of sequence, right-justified 709 // 41-41 space 710 // 42-43 bp 711 // 44-44 space 712 // 45-47 spaces, ss- (single-stranded), ds- (double-stranded), or 713 // ms- (mixed-stranded) 714 // 48-53 NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA), 715 // mRNA (messenger RNA), uRNA (small nuclear RNA). 716 // Left justified. 717 // 54-55 space 718 // 56-63 'linear' followed by two spaces, or 'circular' 719 // 64-64 space 720 // 65-67 The division code 721 // 68-68 space 722 // 69-79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 723 // 724 private void parseLocus(String inLine) 725 { 726 Matcher m = sLocusPattern.matcher(inLine); 727 if (! m.matches()) 728 { 729 throw new SeqFormatException("The " + GenBankKeyword.LOCUS + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!"); 730 } 731 732 if (StringUtil.isSet(m.group(1))) 733 { 734 mCurrentSeq.setID(m.group(1)); 735 } 736 737 mSeqLengthFromLocusLine = Integer.parseInt(m.group(2).trim()); 738 739 // TODO: Group 3 is the detailed sequence type 740 741 if (mCurrentSeq instanceof BioSequencePlus) 742 { 743 if (StringUtil.isSet(m.group(4))) 744 { 745 ((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(4))); 746 } 747 748 if (StringUtil.isSet(m.group(5))) 749 { 750 ((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(NCBIGenBankDivision.valueOf(m.group(5))); 751 } 752 753 if (StringUtil.isSet(m.group(6))) 754 { 755 try 756 { 757 ((BioSequencePlus) mCurrentSeq).setRevisionDate(mDateFormat.parse(m.group(6))); 758 } 759 catch (ParseException e) 760 { 761 // TODO: Add to warnings? 762 System.err.println(e.getMessage()); 763 } 764 } 765 } 766 } 767 768 //--------------------------------------------------------------------------- 769 private String getAccession(T inSeq) 770 { 771 String acc = inSeq.getID(); 772 if (StringUtil.isSet(acc)) 773 { 774 int index = acc.indexOf("."); 775 if (index > 0) 776 { 777 acc = acc.substring(0, index); 778 } 779 } 780 else 781 { 782 acc = ""; 783 } 784 785 return acc; 786 } 787 788 //--------------------------------------------------------------------------- 789 private void writeLocus(T inSeq, Writer inWriter) 790 throws IOException 791 { 792 inWriter.write(String.format("%5s %-16.16s %11d %2.2s", 793 GenBankKeyword.LOCUS, 794 getAccession(inSeq), 795 inSeq.length(), 796 inSeq.getType().equals(BioSequenceType.PROTEIN) ? "aa" : "bp")); 797 798 799 if (inSeq instanceof BioSequencePlus) 800 { 801 BioSequencePlus bioSequencePlus = (BioSequencePlus) inSeq; 802 803 String molTypeString = ""; 804 if (bioSequencePlus.getMolType() != null) 805 { 806 if (bioSequencePlus.getMolType().equals(MolType.genomic_DNA) 807 || bioSequencePlus.getMolType().equals(MolType.unassigned_DNA) 808 || bioSequencePlus.getMolType().equals(MolType.other_DNA)) 809 { 810 molTypeString = "DNA"; 811 } 812 else if (bioSequencePlus.getMolType().equals(MolType.genomic_RNA) 813 || bioSequencePlus.getMolType().equals(MolType.transcribed_RNA) 814 || bioSequencePlus.getMolType().equals(MolType.unassigned_RNA) 815 || bioSequencePlus.getMolType().equals(MolType.other_RNA)) 816 { 817 molTypeString = "RNA"; 818 } 819 else if (bioSequencePlus.getMolType().equals(MolType.mRNA)) 820 { 821 molTypeString = "mRNA"; 822 } 823 else if (bioSequencePlus.getMolType().equals(MolType.tRNA)) 824 { 825 molTypeString = "tRNA"; 826 } 827 } 828 829 // TODO: 'ss-', 'ds-', or 'ms-' prefix for the mol type isn't parsed or output 830 inWriter.write(String.format(" %-6.6s %-8.8s %-3.3s %s", 831 molTypeString, 832 bioSequencePlus.getSeqTopology() != null ? bioSequencePlus.getSeqTopology() : "", 833 bioSequencePlus.getSeqRepositoryDivision() != null ? bioSequencePlus.getSeqRepositoryDivision().getCode() : "", 834 bioSequencePlus.getRevisionDate() != null ? mDateFormat.format(bioSequencePlus.getRevisionDate()).toUpperCase() : "" 835 )); 836 } 837 838 inWriter.write("\n"); 839 } 840 841 //--------------------------------------------------------------------------- 842 // Parse the DEFINITION keyword line 843 // Ex: 844 // DEFINITION ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone 845 // IMAGE:165908 5', mRNA sequence. 846 // 847 private void parseDefinition(String inLine) 848 { 849 String field = inLine.substring(12).trim(); 850 851 if (null == mCurrentSeq.getDescription()) 852 { 853 mCurrentSeq.setDescription(field); 854 } 855 else 856 { 857 mCurrentSeq.setDescription(mCurrentSeq.getDescription() + " " + field); 858 } 859 } 860 861 //--------------------------------------------------------------------------- 862 // Remove the trailing period 863 private void finishDefinition() 864 { 865 String definition = mCurrentSeq.getDescription(); 866 if (definition != null 867 && definition.endsWith(".")) 868 { 869 mCurrentSeq.setDescription(definition.substring(0, definition.length() - 1)); 870 } 871 } 872 873 //--------------------------------------------------------------------------- 874 private void writeDefinition(T inSeq, Writer inWriter) 875 throws IOException 876 { 877 if (StringUtil.isSet(inSeq.getDescription())) 878 { 879 String description = inSeq.getDescription(); 880 if (! description.endsWith(".")) 881 { 882 description += "."; 883 } 884 885 String[] lines = StringUtil.lines(StringUtil.wrap(description, 67)); 886 for (int i = 0; i < lines.length; i++) 887 { 888 inWriter.write(String.format("%-10s %s\n", 889 0 == i ? GenBankKeyword.DEFINITION : "", 890 lines[i])); 891 } 892 } 893 } 894 895 //--------------------------------------------------------------------------- 896 private void writeAccession(T inSeq, Writer inWriter) 897 throws IOException 898 { 899 if (StringUtil.isSet(inSeq.getID())) 900 { 901 inWriter.write(String.format("%-9.9s %s\n", 902 GenBankKeyword.ACCESSION, 903 getAccession(inSeq))); 904 } 905 } 906 907 //--------------------------------------------------------------------------- 908 private void writeVersion(T inSeq, Writer inWriter) 909 throws IOException 910 { 911 if (StringUtil.isSet(inSeq.getID())) 912 { 913 inWriter.write(String.format("%-9.9s %s\n", 914 GenBankKeyword.VERSION, 915 inSeq.getID())); 916 } 917 } 918 919 //--------------------------------------------------------------------------- 920 // Parse the VERSION keyword line 921 // Ex: 922 // VERSION AF181452.1 GI:6017929 923 // ^^^^^^^^^^ ^^^^^^^^^^ 924 // Compound NCBI GI 925 // Accession Identifier 926 // Number 927 // 928 private void parseVersion(String inLine) 929 { 930 String fields[] = inLine.substring(12).trim().split("\\s+"); 931 932 mCurrentSeq.setID(fields[0]); 933 934 if (fields.length > 1) 935 { 936 mCurrentSeq.setAttribute(NCBI_GI_ATTR, fields[1]); 937 } 938 } 939 940 //--------------------------------------------------------------------------- 941 // Parse the KEYWORDS keyword line 942 // Ex: 943 // KEYWORDS EST. 944 // 945 private void parseKeywords(String inLine) 946 { 947 String field = inLine.substring(12).trim(); 948 949 if (field.endsWith(".")) 950 { 951 field = field.substring(0, field.length() - 1); 952 } 953 954 if (mCurrentSeq instanceof BioSequencePlus) 955 { 956 ((BioSequencePlus) mCurrentSeq).addKeywords(field.split(",\\s+")); 957 } 958 } 959 960 //--------------------------------------------------------------------------- 961 // Parse the SOURCE keyword line 962 // Ex: 963 // SOURCE Homo sapiens (human) 964 // ORGANISM Homo sapiens 965 // Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; 966 // Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; 967 // Catarrhini; Hominidae; Homo. 968 // 969 // The SOURCE field consists of two parts. The first part is found after 970 // the SOURCE keyword and contains free-format information including an 971 // abbreviated form of the organism name followed by a molecule type; 972 // multiple lines are allowed, but the last line must end with a period. 973 // The second part consists of information found after the ORGANISM 974 // subkeyword. The formal scientific name for the source organism (genus 975 // and species, where appropriate) is found on the same line as ORGANISM. 976 // The records following the ORGANISM line list the taxonomic 977 // classification levels, separated by semicolons and ending with a 978 // period. 979 // 980 private void parseSource(String inLine) 981 { 982 if (GenBankSubkeyword.ORGANISM.equals(mCurrentSubkeyword)) 983 { 984 String field = inLine.substring(12).trim(); 985 // For now, just keep the first line with the scientific name 986 if (inLine.trim().startsWith(GenBankSubkeyword.ORGANISM.name()) 987 && mCurrentSeq instanceof BioSequencePlus) 988 { 989 Set<NCBITaxon> taxons = NCBITaxon.getByName(field); 990 if (CollectionUtil.hasValues(taxons)) 991 { 992 NCBITaxon taxon; 993 if (taxons.size() > 1) 994 { 995 // TODO: Refine with a better way to choose 996 997 // Choose the one with highest id 998 List<NCBITaxon> sortedTaxons = new ArrayList<>(taxons); 999 Collections.sort(sortedTaxons); 1000 taxon = sortedTaxons.get(sortedTaxons.size() - 1); 1001 } 1002 else 1003 { 1004 taxon = taxons.iterator().next(); 1005 } 1006 1007 ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon); 1008 } 1009 } 1010 } 1011 } 1012 1013 //--------------------------------------------------------------------------- 1014 // Parse the REFERENCE keyword line 1015 // Ex: 1016 // REFERENCE 1 (bases 1 to 342) 1017 // AUTHORS Giachino,C., Padovan,E. and Lanzavecchia,A. 1018 // TITLE kappa+lambda+ dual receptor B cells are present in the human 1019 // peripheral repertoire 1020 // JOURNAL J. Exp. Med. 181 (3), 1245-1250 (1995) 1021 // PUBMED 7869042 1022 // 1023 // Publications by the authors of the sequence that discuss the data reported in 1024 // the record. References are automatically sorted within the record based on date 1025 // of publication, showing the oldest references first. 1026 // 1027 // Some sequences have not been reported in papers and show a status of "unpublished" 1028 // or "in press". When an accession number and/or sequence data has appeared in print, 1029 // sequence authors should send the complete citation of the article to update@ncbi.nlm.nih.gov 1030 // and the GenBank staff will revise the record. 1031 // 1032 // Various classes of publication can be present in the References field, including 1033 // journal article, book chapter, book, thesis/monograph, proceedings chapter, proceedings 1034 // from a meeting, and patent. 1035 // 1036 // The last citation in the REFERENCE field usually contains information about the 1037 // submitter of the sequence, rather than a literature citation. It is therefore 1038 // called the "submitter block" and shows the words "Direct Submission" instead of 1039 // an article title. Additional information is provided below, under the header Direct 1040 // Submission. Some older records do not contain a submitter block. 1041 private void parseReference(String inLine) 1042 throws ParseException 1043 { 1044 if (inLine.startsWith(GenBankKeyword.REFERENCE.name())) 1045 { 1046 // REFERENCE 2 (bases 1 to 200000) 1047 mCurrentReference = new SeqCitation(); 1048 // TODO: Set the reference seq location 1049 1050 if (mCurrentSeq instanceof BioSequencePlus) 1051 { 1052 ((BioSequencePlus) mCurrentSeq).addReference(mCurrentReference); 1053 } 1054 1055 Matcher m = sReferenceLocationPattern.matcher(inLine); 1056 if (m.find()) 1057 { 1058 mCurrentReference.setSeqLocation(new SeqLocation(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)))); 1059 } 1060 } 1061 else 1062 { 1063 String field = inLine.substring(12).trim(); 1064 1065 if (GenBankSubkeyword.AUTHORS.equals(mCurrentSubkeyword)) 1066 { 1067 // Remove trailing comma if present. 1068 if (field.endsWith(",")) 1069 { 1070 field = field.substring(0, field.length() - 1); 1071 } 1072 else if (field.endsWith(" and")) 1073 { 1074 field = field.substring(0, field.length() - 4); 1075 } 1076 1077 if (! field.equals(".")) 1078 { 1079 String[] authorStrings = field.split("(,\\s+|\\s+and\\s+)"); 1080 for (String authorString : authorStrings) 1081 { 1082 mCurrentReference.addAuthor(new Author(authorString)); 1083 } 1084 } 1085 } 1086 else if (GenBankSubkeyword.TITLE.equals(mCurrentSubkeyword)) 1087 { 1088 String title = field; 1089 if (mCurrentReference.getTitle() != null) 1090 { 1091 title = mCurrentReference.getTitle() + " " + title; 1092 } 1093 1094 mCurrentReference.setTitle(title); 1095 } 1096 else if (GenBankSubkeyword.JOURNAL.equals(mCurrentSubkeyword)) 1097 { 1098 mCurrentReference.appendRawContent(field); 1099 1100 String journal = field; 1101 if (mCurrentReference.getJournal() != null) 1102 { 1103 journal = mCurrentReference.getJournal().getTitle() + " " + journal; 1104 } 1105 1106 mCurrentReference.setJournal(new Journal(journal)); 1107 } 1108 else if (GenBankSubkeyword.PUBMED.equals(mCurrentSubkeyword)) 1109 { 1110 mCurrentReference.setPubMedId(field); 1111 } 1112 else if (GenBankSubkeyword.REMARK.equals(mCurrentSubkeyword)) 1113 { 1114 String remark = field; 1115 if (mCurrentReference.getRemark() != null) 1116 { 1117 remark = mCurrentReference.getRemark() + " " + remark; 1118 } 1119 1120 mCurrentReference.setRemark(remark); 1121 } 1122 } 1123 } 1124 1125 //--------------------------------------------------------------------------- 1126 private void finishReference() 1127 throws ParseException 1128 { 1129 // Refine the citation based on the JOURNAL content 1130 if (null == mCurrentReference.getType() 1131 || mCurrentReference.getType().equals(CitationType.journal)) 1132 { 1133 // "Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?" 1134 Matcher m = sReferencePatentPattern.matcher(mCurrentReference.getJournal().getTitle()); 1135 if (m.matches()) 1136 { 1137 // It's not really a journal. It's a patent. 1138 mCurrentReference.setType(CitationType.patent); 1139 PatentData patentData = new PatentData() 1140 .setTitle(mCurrentReference.getTitle()) 1141 .setInventors(mCurrentReference.getAuthors()) 1142 .setPublicationNum(m.group(1)) 1143 .setSeqIdNum(Integer.parseInt(m.group(2))) 1144 .setPublicationDate(mDateFormat.parse(m.group(3))); 1145 1146 if (m.group(4) != null) 1147 { 1148 patentData.setApplicants(parsePatentApplicants(m.group(4))); 1149 } 1150 1151 mCurrentReference.setPatentData(patentData); 1152 mCurrentReference.setJournal(null); 1153 } 1154 else 1155 { 1156 // If it's a direct submission of sequences, extract the submission date and contact info 1157 m = sReferenceDirectSubmissionPattern.matcher(mCurrentReference.getJournal().getTitle()); 1158 if (m.matches()) 1159 { 1160 mCurrentReference.setSubmissionDate(DateUtil.threadsafeParse(m.group(1), sDateFormat)); 1161 mCurrentReference.setYear(Integer.parseInt(m.group(2))); 1162 mCurrentReference.setContactInfo(m.group(3)); 1163 mCurrentReference.setJournal(null); 1164 } 1165 else 1166 { 1167 // Is it a thesis? 1168 m = sReferenceThesisPattern.matcher(mCurrentReference.getJournal().getTitle()); 1169 if (m.matches()) 1170 { 1171 mCurrentReference.setType(CitationType.thesis); 1172 1173 String yearString = m.group(1); 1174 if (yearString != null) 1175 { 1176 mCurrentReference.setYear(Integer.parseInt(yearString)); 1177 } 1178 1179 mCurrentReference.setInstitution(m.group(2)); 1180 mCurrentReference.setJournal(null); 1181 } 1182 else if (mCurrentReference.getJournal().getTitle().startsWith("Published Only in Database")) 1183 { 1184 mCurrentReference.setType(CitationType.online_database); 1185 mCurrentReference.setJournal(null); 1186 } 1187 else 1188 { 1189 m = sReferenceJournalPattern.matcher(mCurrentReference.getJournal().getTitle()); 1190 if (m.matches()) 1191 { 1192 // OK, it's a journal. Fill out the rest of the journal-related fields. 1193 1194 mCurrentReference.setType(CitationType.journal); 1195 mCurrentReference.setJournal(new Journal(m.group(1))); 1196 mCurrentReference.setVolume(m.group(2)); 1197 mCurrentReference.setIssue(m.group(3)); 1198 1199 // Group 4 might be pages or an article number 1200 String pages = m.group(4); 1201 if (pages != null 1202 && pages.toUpperCase().startsWith("E")) 1203 { 1204 mCurrentReference.setArticleNumber(pages); 1205 } 1206 else 1207 { 1208 mCurrentReference.setPages(pages); 1209 } 1210 1211 String year = m.group(5); 1212 if (year != null) 1213 { 1214 mCurrentReference.setYear(Integer.parseInt(year)); 1215 } 1216 } 1217 } 1218 } 1219 } 1220 } 1221 } 1222 1223 //--------------------------------------------------------------------------- 1224 // Because of a lack of format controls, it's nearly impossible to parse this 1225 // content correctly. The best approach would be some sort of NLP. 1226 // This method is protected instead of private to allow unit testing. 1227 protected List<String> parsePatentApplicants(String inApplicantString) 1228 { 1229 List<String> applicants = new ArrayList<>(3); 1230 1231 // Values should be separated by semi-colons 1232 if (inApplicantString.contains(";")) 1233 { 1234 String[] pieces = inApplicantString.split(";"); 1235 for (int i = 0; i < pieces.length; i++) 1236 { 1237 pieces[i] = pieces[i].trim(); 1238 } 1239 1240 // Ends with a location? 1241 if (pieces.length > 1 1242 && pieces.length <= 3 1243 && (2 == pieces[pieces.length - 1].length() // Ends in a two letter country code? 1244 || sPatentLocationPattern.matcher(pieces[pieces.length - 1]).matches() // ', \\w{2}' 1245 || ! pieces[pieces.length - 1].contains(" "))) // Last piece is a single word (probably a city name) 1246 { 1247 applicants.add(StringUtil.join(pieces,", ")); 1248 } 1249 else 1250 { 1251 for (String piece : pieces) 1252 { 1253 applicants.add(piece); 1254 } 1255 } 1256 } 1257 else 1258 { // Sometimes the values are separated by commas 1259 String[] pieces = inApplicantString.split(","); 1260 for (String piece : pieces) 1261 { 1262 piece = piece.trim(); 1263 if (piece.length() > 0) 1264 { 1265 String ucPiece = piece.toUpperCase(); 1266 1267 // Is it a single word or a company suffix? 1268 if (applicants.size() > 0 1269 && (! piece.contains(" ") 1270 || ucPiece.startsWith("INC ") 1271 || ucPiece.startsWith("INC. ") 1272 || ucPiece.startsWith("LLC ") 1273 || (sPatentParensLocationPattern.matcher(ucPiece).matches() 1274 && applicants.size() > 0 1275 && ! sPatentParensLocationPattern.matcher(applicants.get(applicants.size() - 1)).matches()))) 1276 { 1277 // Add it to the previous piece 1278 int lastIndex = applicants.size() - 1; 1279 applicants.set(lastIndex, applicants.get(lastIndex) + ", " + piece); 1280 } 1281 else 1282 { 1283 applicants.add(piece.trim()); 1284 } 1285 } 1286 } 1287 } 1288 1289 return applicants; 1290 } 1291 1292 //--------------------------------------------------------------------------- 1293 // Parse the COMMENT keyword line 1294 // Ex: 1295 // COMMENT Contact: Wilson RK 1296 // Washington University School of Medicine 1297 // 4444 Forest Park Parkway, Box 8501, St. Louis, MO 63108 1298 // Tel: 314 286 1800 1299 // Fax: 314 286 1810 1300 // Email: est@watson.wustl.edu 1301 // Insert Size: 1482 1302 // High quality sequence stops: 353 Source: IMAGE Consortium, LLNL 1303 // This clone is available royalty-free through LLNL ; contact the 1304 // IMAGE Consortium (info@image.llnl.gov) for further information. 1305 // Insert Length: 1482 Std Error: 0.00 1306 // Seq primer: M13RP1 1307 // High quality sequence stop: 353. 1308 // 1309 private void parseComment(String inLine) 1310 { 1311 String field = inLine.substring(12).trim(); 1312 1313 if (mCurrentSeq.getAttribute(COMMENT_ATTR) != null) 1314 { 1315 mCurrentSeq.setAttribute(COMMENT_ATTR, mCurrentSeq.getAttribute(COMMENT_ATTR) + "\n" + field); 1316 } 1317 else 1318 { 1319 mCurrentSeq.setAttribute(COMMENT_ATTR, field); 1320 } 1321 } 1322 1323 //--------------------------------------------------------------------------- 1324 // Parse the DBLINK keyword line 1325 // Ex: 1326 // DBLINK BioProject:PRJNA174162,PRJNA999998,PRJNA999999 1327 // BioSample: SAMN01795900 1328 // 1329 // "This line contains cross-references to other underlying resources that 1330 // support the existence of a GenBank sequence record... 1331 // A DBLINK cross-reference consists of two data fields delimited by a colon. 1332 // The first field provides the cross-reference type ("BioProject"), while the 1333 // second contains the actual cross-reference identifier ("PRJNA177352"). 1334 // The second field can consist of multiple comma-separated identifiers, 1335 // if a sequence record has multiple DBLINK cross-references of a given type." 1336 // 1337 private void parseDBLink(String inLine) 1338 { 1339 if (mCurrentSeq instanceof BioSequencePlus) 1340 { 1341 BioSequencePlus sequencePlus = (BioSequencePlus) mCurrentSeq; 1342 String field = inLine.substring(12).trim(); 1343 String[] pieces = field.split(":"); 1344 if (2 == pieces.length) 1345 { 1346 String[] values = pieces[1].split(","); 1347 for (String value : values) 1348 { 1349 sequencePlus.addDbXref(new DbXref(pieces[0].trim(), value.trim())); 1350 } 1351 } 1352 else 1353 { 1354 // Continuation of previous db identifiers 1355 String db = sequencePlus.getDbXrefs().get(sequencePlus.getDbXrefs().size() - 1).getDB(); 1356 String[] values = field.split(","); 1357 for (String value : values) 1358 { 1359 sequencePlus.addDbXref(new DbXref(db, value.trim())); 1360 } 1361 } 1362 } 1363 } 1364 1365 //--------------------------------------------------------------------------- 1366 private void writeDBLinks(List<DbXref> inDBXrefs, Writer inWriter) 1367 throws IOException 1368 { 1369 Map<String, StringBuilderPlus> xRefMap = new OrderedMap<>(4); 1370 1371 for (DbXref xref : inDBXrefs) 1372 { 1373 StringBuilderPlus line = xRefMap.get(xref.getDB()); 1374 if (null == line) 1375 { 1376 line = new StringBuilderPlus(xref.getDB() + ":" + xref.getId()); 1377 xRefMap.put(xref.getDB(), line); 1378 } 1379 else 1380 { 1381 line.delimitedAppend(xref.getId()); 1382 } 1383 } 1384 1385 int count = 0; 1386 for (String db : xRefMap.keySet()) 1387 { 1388 String[] lines = StringUtil.lines(StringUtil.wrap(xRefMap.get(db).toString(), 67)); 1389 1390 for (String line : lines) 1391 { 1392 count++; 1393 inWriter.write(String.format("%-12.12s%s", (1 == count ? GenBankKeyword.DBLINK : ""), line)); 1394 } 1395 } 1396 } 1397 1398 //--------------------------------------------------------------------------- 1399 // Parse the CONTIG keyword line 1400 // Ex: 1401 // CONTIG join(D86993.1:7160..39752,D87004.2:803..13993) 1402 // 1403 private void parseContig(String inLine) 1404 { 1405 String field = inLine.substring(12).trim(); 1406 1407 if (mCurrentSeq.getAttribute(CONTIG_ATTR) != null) 1408 { 1409 mCurrentSeq.setAttribute(CONTIG_ATTR, mCurrentSeq.getAttribute(CONTIG_ATTR) + field); 1410 } 1411 else 1412 { 1413 mCurrentSeq.setAttribute(CONTIG_ATTR, field); 1414 } 1415 } 1416 1417 //--------------------------------------------------------------------------- 1418 private void parseFeatures(String inLine) 1419 { 1420 if (! inLine.startsWith(GenBankKeyword.FEATURES.name())) 1421 { 1422 // Is there a feature key on this line? 1423 String featureKeyString = inLine.substring(5, 20).trim(); 1424 if (StringUtil.isSet(featureKeyString)) 1425 { 1426 GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString); 1427 if (null == featureKey) 1428 { 1429 throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!"); 1430 } 1431 1432 String locationString = inLine.substring(21).trim(); 1433 mCurrentFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString)); 1434 if (mCurrentSeq instanceof BioSequencePlus) 1435 { 1436 ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentFeature); 1437 } 1438 1439 // Unquote the previous qualifier if necessary 1440 if (mCurrentFeatureQualifier != null 1441 && mCurrentFeatureQualifier.getValue().startsWith("\"")) 1442 { 1443 mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue())); 1444 } 1445 mCurrentFeatureQualifier = null; 1446 } 1447 else 1448 { 1449 String content = inLine.substring(21).trim(); 1450 1451 Matcher m = sFeatureQualifierPattern.matcher(content); 1452 if (m.matches()) 1453 { 1454 // New qualifier 1455 1456 // Unquote the previous qualifier if necessary 1457 if (mCurrentFeatureQualifier != null 1458 && mCurrentFeatureQualifier.getValue().startsWith("\"")) 1459 { 1460 mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue())); 1461 } 1462 1463 GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1)); 1464 if (null == qualifierName) 1465 { 1466 throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); 1467 } 1468 1469 mCurrentFeatureQualifier = new GenBankFeatureQualifier(qualifierName); 1470 mCurrentFeature.addQualifier(mCurrentFeatureQualifier); 1471 1472 String value = m.group(2); 1473 if (value != null) 1474 { 1475 mCurrentFeatureQualifier.appendToValue(value); 1476 } 1477 } 1478 else if (mCurrentFeatureQualifier != null) 1479 { 1480 // Continuation of a previous qualifier 1481 mCurrentFeatureQualifier.appendToValue(content); 1482 } 1483 else if (mCurrentFeature != null) 1484 { 1485 // Continuation of a feature location 1486 mCurrentFeature.getLocation().append(content); 1487 } 1488 } 1489 } 1490 } 1491 1492 //--------------------------------------------------------------------------- 1493 private void writeReferences(List<SeqCitation> inSeqCitations, Writer inWriter) 1494 throws IOException 1495 { 1496 int count = 0; 1497 for (SeqCitation citation : inSeqCitations) 1498 { 1499 count++; 1500 1501 inWriter.write(String.format("%s %-3d%s\n", 1502 GenBankKeyword.REFERENCE, 1503 count, 1504 citation.getSeqLocation() != null ? "(bases " + citation.getSeqLocation().getStart() + " to " + citation.getSeqLocation().getEnd() + ")" : "")); 1505 1506 // Authors 1507 StringBuilderPlus authors = new StringBuilderPlus().setDelimiter(", "); 1508 for (int i = 0; i < citation.getAuthors().size(); i++) 1509 { 1510 Author author = citation.getAuthors().get(i); 1511 if (citation.getAuthors().size() > 1 1512 && i == citation.getAuthors().size() - 1) 1513 { 1514 authors.append(" and " + author.getLastName() + "," + author.getFirstInitial() + "."); 1515 } 1516 else 1517 { 1518 authors.delimitedAppend(author.getLastName() + "," + author.getFirstInitial() + "."); 1519 } 1520 } 1521 String[] lines = StringUtil.lines(StringUtil.wrap(authors.toString(), 67)); 1522 for (int i = 0; i < lines.length; i++) 1523 { 1524 inWriter.write(String.format(" %-7.7s %s\n", 1525 0 == i ? GenBankSubkeyword.AUTHORS.name() : "", 1526 lines[i])); 1527 } 1528 1529 // Title 1530 lines = StringUtil.lines(StringUtil.wrap(citation.getTitle(), 67)); 1531 for (int i = 0; i < lines.length; i++) 1532 { 1533 inWriter.write(String.format(" %-5.5s %s\n", 1534 0 == i ? GenBankSubkeyword.TITLE.name() : "", 1535 lines[i])); 1536 } 1537 1538 // Journal 1539 // JOURNAL J. Exp. Med. 188 (11), 2151-2162 (1998) 1540 StringBuilderPlus journal = new StringBuilderPlus(); 1541 if (StringUtil.isSet(citation.toString())) 1542 { 1543 journal.append(citation.toString()); 1544 } 1545 else 1546 { 1547 journal.append(citation.getJournal()) 1548 .append(".") 1549 .append(citation.getVolume() != null ? " " + citation.getVolume() : "") 1550 .append(citation.getIssue() != null ? " (" + citation.getIssue() + ")" : "") 1551 .append(citation.getPages() != null ? ", " + citation.getPages() : "") 1552 .append(citation.getYear() != null ? " (" + citation.getYear() + ")" : ""); 1553 } 1554 1555 lines = StringUtil.lines(StringUtil.wrap(journal.toString(), 67)); 1556 for (int i = 0; i < lines.length; i++) 1557 { 1558 inWriter.write(String.format(" %-7.7s %s\n", 1559 0 == i ? GenBankSubkeyword.JOURNAL.name() : "", 1560 lines[i])); 1561 } 1562 1563 // Pubmed id 1564 if (StringUtil.isSet(citation.getPubMedId())) 1565 { 1566 inWriter.write(String.format(" %-6.6s %s\n", 1567 GenBankSubkeyword.PUBMED.name(), 1568 citation.getPubMedId())); 1569 } 1570 1571 // Remark 1572 if (StringUtil.isSet(citation.getRemark())) 1573 { 1574 lines = StringUtil.lines(StringUtil.wrap(citation.getRemark(), 67)); 1575 for (int i = 0; i < lines.length; i++) 1576 { 1577 inWriter.write(String.format(" %-6.6s %s\n", 1578 0 == i ? GenBankSubkeyword.REMARK.name() : "", 1579 lines[i])); 1580 } 1581 } 1582 1583 } 1584 } 1585 1586 //--------------------------------------------------------------------------- 1587 private void writeFeature(SeqFeature inSeqFeature, Writer inWriter) 1588 throws IOException 1589 { 1590 1591 String[] lines = StringUtil.lines(StringUtil.wrap(inSeqFeature.getLocation().toString(), 58)); 1592 for (int i = 0; i < lines.length; i++) 1593 { 1594 inWriter.write(String.format(" %-15.15s %s\n", 1595 0 == i ? inSeqFeature.name() : "", 1596 lines[i])); 1597 } 1598 1599 if (CollectionUtil.hasValues(inSeqFeature.getQualifiers())) 1600 { 1601 for (FeatureQualifier qualifier : inSeqFeature.getQualifiers()) 1602 { 1603 String qualifierString = "/" + qualifier.name(); 1604 if (StringUtil.isSet(qualifier.getValue())) 1605 { 1606 qualifierString += "=\"" + qualifier.getValue() + "\""; 1607 } 1608 1609 lines = StringUtil.lines(StringUtil.wrap(qualifierString, 58)); 1610 for (int i = 0; i < lines.length; i++) 1611 { 1612 inWriter.write(String.format(" %s\n", lines[i])); 1613 } 1614 } 1615 } 1616 } 1617 1618 //########################################################################### 1619 // INNER CLASS 1620 //########################################################################### 1621 1622 class GenBankSeqFilterReader extends LettersOnlyReader 1623 { 1624 //--------------------------------------------------------------------------- 1625 public GenBankSeqFilterReader(Reader inReader) 1626 { 1627 super(inReader); 1628 } 1629 1630 //--------------------------------------------------------------------------- 1631 @Override 1632 public int read() 1633 throws IOException 1634 { 1635 int returnChar; 1636 1637 do 1638 { 1639 returnChar = innerRead(); 1640 } 1641 while (returnChar >= 0 1642 && (Character.isWhitespace(returnChar) 1643 || Character.isDigit(returnChar) 1644 || returnChar == '/')); 1645 1646 return returnChar; 1647 } 1648 } 1649 1650}