001package com.hfg.bio.seq.format; 002 003import java.io.BufferedReader; 004import java.io.IOException; 005import java.io.Reader; 006import java.util.List; 007import java.util.logging.Level; 008import java.util.logging.Logger; 009import java.util.regex.Matcher; 010import java.util.regex.Pattern; 011 012import com.hfg.bio.DbXref; 013import com.hfg.bio.seq.BioSequence; 014import com.hfg.bio.seq.BioSequenceFactory; 015import com.hfg.bio.seq.BioSequencePlus; 016import com.hfg.bio.seq.Clone; 017import com.hfg.bio.seq.SeqLocation; 018import com.hfg.bio.seq.SeqTopology; 019import com.hfg.bio.seq.format.feature.FeatureQualifier; 020import com.hfg.bio.seq.format.feature.SeqFeature; 021import com.hfg.bio.seq.format.feature.genbank.GenBankFeature; 022import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureKey; 023import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureLocation; 024import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifier; 025import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifierName; 026import com.hfg.bio.seq.format.feature.qualifier.MolType; 027import com.hfg.bio.seq.format.feature.uniprot.UniProtFeature; 028import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureKey; 029import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureLocation; 030import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifier; 031import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifierName; 032import com.hfg.bio.taxonomy.uniprot.EMBL_TaxonDivision; 033import com.hfg.bio.taxonomy.ncbi.NCBITaxon; 034import com.hfg.citation.Author; 035import com.hfg.util.StringBuilderPlus; 036import com.hfg.util.StringUtil; 037import com.hfg.util.collection.CollectionUtil; 038import com.hfg.util.io.LettersOnlyReader; 039 040//------------------------------------------------------------------------------ 041/** 042 EMBL sequence format. 
043 <div> 044 See <a href='ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt'>ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt</a> 045 </div> 046 <div> 047 See <a href='http://web.expasy.org/docs/userman.html'>http://web.expasy.org/docs/userman.html</a> for info on the Uniprot format variant. 048 </div> 049 050 <div> 051 @author J. Alex Taylor, hairyfatguy.com 052 </div> 053 */ 054//------------------------------------------------------------------------------ 055// com.hfg Library 056// 057// This library is free software; you can redistribute it and/or 058// modify it under the terms of the GNU Lesser General Public 059// License as published by the Free Software Foundation; either 060// version 2.1 of the License, or (at your option) any later version. 061// 062// This library is distributed in the hope that it will be useful, 063// but WITHOUT ANY WARRANTY; without even the implied warranty of 064// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 065// Lesser General Public License for more details. 066// 067// You should have received a copy of the GNU Lesser General Public 068// License along with this library; if not, write to the Free Software 069// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 070// 071// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 072// jataylor@hairyfatguy.com 073//------------------------------------------------------------------------------ 074 075/* 076Example record from ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt : 077 078ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 079XX 080AC X56734; S46826; 081XX 082DT 12-SEP-1991 (Rel. 29, Created) 083DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 084XX 085DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 086XX 087KW beta-glucosidase. 
088XX 089OS Trifolium repens (white clover) 090OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 091OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 092OC fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 093XX 094RN [5] 095RP 1-1859 096RX DOI; 10.1007/BF00039495. 097RX PUBMED; 1907511. 098RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 099RT "Nucleotide and derived amino acid sequence of the cyanogenic 100RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 101RL Plant Mol. Biol. 17(2):209-219(1991). 102XX 103RN [6] 104RP 1-1859 105RA Hughes M.A.; 106RT ; 107RL Submitted (19-NOV-1990) to the INSDC. 108RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 109RL Upon Tyne, NE2 4HH, UK 110XX 111DR EuropePMC; PMC99098; 11752244. 112XX 113FH Key Location/Qualifiers 114FH 115FT source 1..1859 116FT /organism="Trifolium repens" 117FT /mol_type="mRNA" 118FT /clone_lib="lambda gt10" 119FT /clone="TRE361" 120FT /tissue_type="leaves" 121FT /db_xref="taxon:3899" 122FT mRNA 1..1859 123FT /experiment="experimental evidence, no additional details 124FT recorded" 125FT CDS 14..1495 126FT /product="beta-glucosidase" 127FT /EC_number="3.2.1.21" 128FT /note="non-cyanogenic" 129FT /db_xref="GOA:P26204" 130FT /db_xref="InterPro:IPR001360" 131FT /db_xref="InterPro:IPR013781" 132FT /db_xref="InterPro:IPR017853" 133FT /db_xref="InterPro:IPR018120" 134FT /db_xref="UniProtKB/Swiss-Prot:P26204" 135FT /protein_id="CAA40058.1" 136FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 137FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 138FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 139FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 140FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 141FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 142FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 
143FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 144FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 145XX 146SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 147 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 148 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 149 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 150 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 151 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 152 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 153 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 154 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 155 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 156 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 157 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 158 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 159 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 160 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 161 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 162 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 163 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 164 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 165 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 166 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 167 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 168 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 169 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 170 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 171 taatgaatgg tttgcaggct ttactgttcg 
ttttggatta aactttgtag attagaaaga 1500
 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
//
 Figure 1 - A sample entry from the database
*/
public class EMBL<T extends BioSequence> extends ReadableSeqFormatBase<T>
{

   // Variables used during parsing
   // NOTE: parsing state is held in instance fields, so one instance parses one record at a time.
   private T mCurrentSeq; // Sequence object being populated; created by readRecord()
   private FormatVariant mFormatVariant; // EMBL vs. UniProt flavor, determined from the ID line
   private String mCurrentLineCode; // Two-character code of the line most recently handed to parseLine()
   private GenBankFeature mCurrentGenBankFeature; // Presumably maintained by the FT parsing code (not visible here)
   private GenBankFeatureQualifier mCurrentGenBankFeatureQualifier; // Last qualifier seen; unquoted and cleared at the end of readRecord()
   private UniProtFeature mCurrentUniProtFeature; // Presumably maintained by the FT parsing code (not visible here)
   private UniProtFeatureQualifier mCurrentUniProtFeatureQualifier; // Presumably maintained by the FT parsing code (not visible here)
   private SeqCitation mCurrentCitation; // Reference started by the most recent RN line
   private Integer mSeqLengthFromIdLine; // Sequence length as declared on the ID line

   private String mCurrentKeywords; // Raw KW text accumulated until the terminating '.'

   // Valid line codes
   public static final String AC_LINE_CODE = "AC"; // AC - accession number (>=1 per entry)
   public static final String AH_LINE_CODE = "AH"; // AH - assembly header (0 or 1 per entry)
   public static final String AS_LINE_CODE = "AS"; // AS - assembly information (0 or >=1 per entry)
   public static final String CC_LINE_CODE = "CC"; // CC - comments or notes (>=0 per entry)
   public static final String CO_LINE_CODE = "CO"; // CO - contig/construct line (0 or >=1 per entry)
   public static final String DE_LINE_CODE = "DE"; // DE - description (>=1 per entry)
   public static final String DR_LINE_CODE = "DR"; // DR - database cross-reference (>=0 per entry)
   public static final String DT_LINE_CODE = "DT"; // DT - date (2 per entry)
   public static final String FH_LINE_CODE = "FH"; // FH - feature table header (2 per entry)
   public static final String FT_LINE_CODE = "FT"; // FT - feature table data (>=2 per entry)
   public static final String ID_LINE_CODE = "ID"; // ID - identification (begins each entry; 1 per entry)
   public static final String KW_LINE_CODE = "KW"; // KW - keyword (>=1 per entry)
   public static final String OC_LINE_CODE = "OC"; // OC - organism classification (>=1 per entry)
   public static final String OG_LINE_CODE = "OG"; // OG - organelle (0 or 1 per entry)
   public static final String OS_LINE_CODE = "OS"; // OS - organism species (>=1 per entry)
   public static final String PR_LINE_CODE = "PR"; // PR - project identifier (0 or 1 per entry)
   public static final String RA_LINE_CODE = "RA"; // RA - reference author(s) (>=0 per entry)
   public static final String RC_LINE_CODE = "RC"; // RC - reference comment (>=0 per entry)
   public static final String RG_LINE_CODE = "RG"; // RG - reference group (>=0 per entry)
   public static final String RL_LINE_CODE = "RL"; // RL - reference location (>=1 per entry)
   public static final String RN_LINE_CODE = "RN"; // RN - reference number (>=1 per entry)
   public static final String RP_LINE_CODE = "RP"; // RP - reference positions (>=1 per entry)
   public static final String RT_LINE_CODE = "RT"; // RT - reference title (>=1 per entry)
   public static final String RX_LINE_CODE = "RX"; // RX - reference cross-reference (>=0 per entry)
   public static final String SQ_LINE_CODE = "SQ"; // SQ - sequence header (1 per entry)
   // EMBL-specific line codes
   public static final String XX_LINE_CODE = "XX"; // XX - spacer line (many per entry)
   // UniProt-specific line codes
   public static final String GN_LINE_CODE = "GN"; // GN - gene name(s), per the UniProt user manual
   public static final String OH_LINE_CODE = "OH"; // Organism host taxonomy cross-reference
   public static final String OX_LINE_CODE = "OX"; // Organism taxonomy cross-reference
   public static final String PE_LINE_CODE = "PE"; // PE - protein existence, per the UniProt user manual

   //
// Attributes populated into the sequence object
   public static final String CLONE_ATTR = "Clone";
   public static final String EMBL_DATA_CLASS_ATTR = "EMBL Data Class";
   public static final String SPECIES_SCIENTIFIC_NAME_ATTR = "Species Scientific Name";
   public static final String ORGANISM_CLASSIFICATION_ATTR = "Organism Classification";
   public static final String ORGANISM_NCBI_TAXON_ID_ATTR = "Organism NCBI Taxon ID";


   // Flavor of the flat-file format, as determined from the ID line.
   enum FormatVariant
   {
      EMBL,
      UniProt
   }

   // Named after this class (previously created from GenBank.class, so that
   // adjusting this logger silently reconfigured the GenBank parser's logger).
   private final static Logger LOGGER = Logger.getLogger(EMBL.class.getName());

   static
   {
      LOGGER.setLevel(Level.WARNING);
      LOGGER.setUseParentHandlers(true);
   }

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a parser that builds its sequence objects with the specified factory.
    @param inSeqFactory the factory used to create a sequence object for each record read
    */
   public EMBL(BioSequenceFactory<T> inSeqFactory)
   {
      super(inSeqFactory);
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    @return the logger used by this class
    */
   public static Logger getLogger()
   {
      return LOGGER;
   }

   //---------------------------------------------------------------------------
   /**
    Tests whether the specified line is the record terminator.
    @param inLine the line to test
    @return true if the line, ignoring surrounding whitespace, is exactly "//"
    */
   public boolean isEndOfRecord(String inLine)
   {
      return inLine.trim().equals("//");
   }

   //---------------------------------------------------------------------------
   /**
    @return always false for this format
    */
   public boolean hasJanusDelimiter()
   {
      return false;
   }

   //---------------------------------------------------------------------------
   /**
    Reads one record from the specified reader and returns the populated sequence object.
    Lines preceding the ID line are skipped (at most 50 of them). The annotation lines
    are parsed up to and including the SQ line, after which the remainder of the
    record is read as sequence.
    @param inReader the reader positioned at (or shortly before) the start of a record
    @return the sequence object populated from the record
    @throws SeqFormatException if no ID line is found or the record content is malformed
    @throws SeqIOException if any other problem occurs while reading the record
    */
   public T readRecord(BufferedReader inReader)
      throws SeqIOException
   {
      initRecordParsing();

      int lineCount = 0;
      int maxPreIdLines = 50;
      boolean idLineFound = false;
      boolean endOfRecordReached = false;

      try
      {
         mCurrentSeq = getBioSequenceFactory().createSeqObj();

         String line;
         while ((line = inReader.readLine()) != null)
         {
            lineCount++;

            if (! idLineFound)
            {
               if (lineCount > maxPreIdLines)
               {
                  throw new SeqFormatException("No EMBL ID line found within " + maxPreIdLines + " lines of the start!");
               }

               if (line.length() > 2)
               {
                  if (ID_LINE_CODE.equals(line.substring(0, 2)))
                  {
                     idLineFound = true;
                  }
                  else
                  {
                     // Still hunting for the ID line
                     continue;
                  }
               }
            }
            else if (isEndOfRecord(line))
            {
               endOfRecordReached = true;
               break;
            }

            if (idLineFound
                && StringUtil.isSet(line))
            {
               parseLine(line);

               if (SQ_LINE_CODE.equals(mCurrentLineCode))
               {
                  // Everything after the SQ header is sequence data
                  break;
               }
            }
         }

         if (! idLineFound)
         {
            throw new SeqFormatException("No EMBL ID line found!");
         }

         // The rest of the record is assumed to be sequence. If the record terminator
         // was already consumed (a record without an SQ section) there is nothing left
         // of this record to read; reading on would swallow the start of the next record.
         if (! endOfRecordReached)
         {
            // Cleanup the sequence to remove spaces and numbers
            Reader filterReader = new SeqFilterReader(inReader);
            mCurrentSeq.setSequence(filterReader);

            filterReader.close();
         }

         // Cleanup
         finalizeLastFeatureQualifier();

         // TODO: Clean the '.' off the end of feature descriptions
      }
      catch (SeqFormatException e)
      {
         throw new SeqFormatException(composeParseFailureMsg(), e);
      }
      catch (Exception e)
      {
         throw new SeqIOException(composeParseFailureMsg(), e);
      }

      return mCurrentSeq;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Builds the message used when wrapping a parsing failure. Tolerates a null
   // sequence object (the factory call itself may be what failed) so that the
   // original exception is never masked by a NullPointerException.
   private String composeParseFailureMsg()
   {
      String id = (mCurrentSeq != null ? mCurrentSeq.getID() : null);

      return "Problem parsing EMBL record" + (StringUtil.isSet(id) ? " " + id : "") + "!";
   }

   //---------------------------------------------------------------------------
   // Finishes off the last feature qualifier that was being parsed and, when one was
   // present, copies clone / sub_clone information from the first source feature
   // into the CLONE_ATTR attribute of the sequence.
   private void finalizeLastFeatureQualifier()
   {
      if (null == mCurrentGenBankFeatureQualifier)
      {
         return;
      }

      // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES.
      String lastValue = mCurrentGenBankFeatureQualifier.getValue();
      if (lastValue != null
          && lastValue.startsWith("\""))
      {
         mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(lastValue));
      }
      mCurrentGenBankFeatureQualifier = null;

      if (mCurrentSeq instanceof BioSequencePlus)
      {
         List<SeqFeature> sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
         if (CollectionUtil.hasValues(sourceFeatures))
         {
            SeqFeature source = sourceFeatures.get(0);
            List<FeatureQualifier> cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
            if (CollectionUtil.hasValues(cloneQualifiers))
            {
               Clone clone = new Clone(cloneQualifiers.get(0).getValue());

               List<FeatureQualifier> subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
               if (CollectionUtil.hasValues(subcloneQualifiers))
               {
                  clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
               }

               mCurrentSeq.setAttribute(CLONE_ATTR, clone);
            }
         }
      }
   }

   //---------------------------------------------------------------------------
   // Resets the per-record parsing state so that nothing leaks from the previous record.
   private void initRecordParsing()
   {
      mCurrentSeq = null;
      mCurrentLineCode = null;
      mCurrentGenBankFeature = null;
      mCurrentGenBankFeatureQualifier = null;
      mCurrentUniProtFeature = null;
      mCurrentUniProtFeatureQualifier = null;
      mCurrentCitation = null;
      mSeqLengthFromIdLine = null;
      mCurrentKeywords = "";

      // mFormatVariant is deliberately left alone: it is reassigned while parsing the
      // ID line of each record and is dereferenced by later line parsers.
   }

   //---------------------------------------------------------------------------
   // Dispatches a single annotation line to the parser for its two-character line code.
   // Throws SeqFormatException for a line too short to carry a line code, for an
   // unrecognized line code, or for a UniProt-only line code in a non-UniProt record.
   private void parseLine(String inLine)
   {
      if (inLine.length() < 2)
      {
         throw new SeqFormatException("Line too short to contain a line code: " + StringUtil.quote(inLine) + "!");
      }

      mCurrentLineCode = inLine.substring(0, 2);
      switch (mCurrentLineCode)
      {
         case XX_LINE_CODE:
            // Blank line
            break;
         case CC_LINE_CODE:
            // Comment line
            break;
         case ID_LINE_CODE:
            parseID(inLine);
            break;
         case AC_LINE_CODE:
            // Accessions
            parseAC(inLine);
            break;
         case DT_LINE_CODE:
            // Date
            //TODO: parseDT(inLine);
            break;
         case DE_LINE_CODE:
            // Description
            parseDE(inLine);
            break;
         case KW_LINE_CODE:
            // Keywords
            parseKW(inLine);
            break;
         case OS_LINE_CODE:
            // Organism species
            parseOS(inLine);
            break;
         case OC_LINE_CODE:
            // Organism classification
            parseOC(inLine);
            break;
         case OG_LINE_CODE:
            // Organelle
            // TODO
            break;
         case PR_LINE_CODE:
            // Project identifier
            // TODO
            break;
         case RN_LINE_CODE:
            // Reference number (start of a new reference)
            mCurrentCitation = new SeqCitation();
            if (mCurrentSeq instanceof BioSequencePlus)
            {
               ((BioSequencePlus) mCurrentSeq).addReference(mCurrentCitation);
            }
            break;
         case RA_LINE_CODE:
            // Reference author(s)
            parseRA(inLine);
            break;
         case RG_LINE_CODE:
            // Reference group
            // TODO
            break;
         case RT_LINE_CODE:
            // Reference title
            parseRT(inLine);
            break;
         case RX_LINE_CODE:
            // Reference cross-reference
            parseRX(inLine);
            break;
         case RP_LINE_CODE:
            // Reference positions
            parseRP(inLine);
            break;
         case RL_LINE_CODE:
            // Reference location
            parseRL(inLine);
            break;
         case RC_LINE_CODE:
            // Reference comment
            //TODO
            break;
         case DR_LINE_CODE:
            // Database cross-reference
            parseDR(inLine);
            break;
         case AH_LINE_CODE:
         case AS_LINE_CODE:
            // Assembly info
            // TODO
            break;
         case CO_LINE_CODE:
            // Contig/construct line
            // TODO
            break;
         case FH_LINE_CODE:
            // Feature table header. Ignore
            break;
         case FT_LINE_CODE:
            // Features
            parseFT(inLine);
            break;
         case SQ_LINE_CODE:
            // Sequence data
            // TODO
            break;
         case GN_LINE_CODE:
            assertUniProtVariant();
            break;
         case OH_LINE_CODE:
            // Organism host taxonomy cross-reference
            assertUniProtVariant();
            // TODO
            break;
         case OX_LINE_CODE:
            assertUniProtVariant();
            parseOX(inLine);
            break;
         case PE_LINE_CODE:
            assertUniProtVariant();
            break;
         default:
            throw new SeqFormatException("Unrecognized line code: " + StringUtil.quote(mCurrentLineCode) + "!");
      }
   }

   //---------------------------------------------------------------------------
   // Rejects the current line code unless the record was identified as the UniProt variant.
   private void assertUniProtVariant()
   {
      if (mFormatVariant != FormatVariant.UniProt)
      {
         throw new SeqFormatException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
      }
   }

   //---------------------------------------------------------------------------
   // Parse the ID line
   // Ex #1: ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
   // Ex #2: ID FMT_ANADE Reviewed; 312 AA.
577 // 578 // The ID (IDentification) line is always the first line of an entry. The 579 // format of the ID line is: 580 // ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 581 // The tokens represent: 582 // 1. Primary accession number 583 // 2. Sequence version number 584 // 3. Topology: 'circular' or 'linear' 585 // 4. Molecule type (see note 1 below) 586 // 5. Data class (see section 3.1) 587 // 6. Taxonomic division (see section 3.2) 588 // 7. Sequence length (see note 2 below) 589 private static final Pattern sEMBL_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+);\\s+SV\\s+(\\w+);\\s+(\\w+);\\s+(.+?);\\s+(\\w+);\\s+(\\w+);\\s+(\\w+) BP."); 590 private static final Pattern sUniProt_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+)\\s+(?:Reviewed|Unreviewd);\\s+(\\w+) AA."); 591 private static final Pattern sSeqLengthPattern = Pattern.compile("(\\w+) (AA|BP)."); 592 593 private void parseID(String inLine) 594 { 595 Matcher m = sEMBL_ID_Pattern1.matcher(inLine); 596 if (m.matches()) 597 { 598 mFormatVariant = FormatVariant.EMBL; 599 600 if (StringUtil.isSet(m.group(1))) 601 { 602 mCurrentSeq.setID(m.group(1) + '.' + m.group(2)); 603 } 604 605 if (mCurrentSeq instanceof BioSequencePlus) 606 { 607 if (m.group(3) != null) 608 { 609 ((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(3))); 610 } 611 612 if (m.group(4) != null) 613 { 614 ((BioSequencePlus) mCurrentSeq).setMolType(MolType.retrieveOrCreateValueOf(m.group(4))); 615 } 616 617 mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, EMBL_DataClass.valueOf(m.group(5))); 618 619 ((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(EMBL_TaxonDivision.valueOf(m.group(6))); 620 } 621 622 mSeqLengthFromIdLine = Integer.parseInt(m.group(7)); 623 } 624 else 625 { 626 m = sUniProt_ID_Pattern1.matcher(inLine); 627 if (m.matches()) 628 { 629 mFormatVariant = FormatVariant.UniProt; 630 } 631 else 632 { 633 // Non-standard ID line format. Do the best we can. 
634 String[] pieces = inLine.substring(2).split(";"); 635 636 int pieceIndex = 1; 637 638 String id = pieces[0].trim().split("\\s+")[0].trim(); 639 if (pieces.length > 1 640 && pieces[1].trim().startsWith("SV")) 641 { 642 id += "." + pieces[1].trim().substring(2).trim(); 643 pieceIndex++; 644 } 645 646 mCurrentSeq.setID(id); 647 648 if (mCurrentSeq instanceof BioSequencePlus) 649 { 650 BioSequencePlus bioSequencePlus = (BioSequencePlus) mCurrentSeq; 651 652 boolean topologyFound = false; 653 boolean molTypeFound = false; 654 boolean dataClassFound = false; 655 boolean divisionFound = false; 656 657 for (; pieceIndex < pieces.length; pieceIndex++) 658 { 659 String field = pieces[pieceIndex].trim(); 660 661 if (! topologyFound) 662 { 663 SeqTopology seqTopology = SeqTopology.valueOf(field); 664 if (seqTopology != null) 665 { 666 bioSequencePlus.setSeqTopology(seqTopology); 667 topologyFound = true; 668 continue; 669 } 670 } 671 672 if (! molTypeFound) 673 { 674 MolType molType = MolType.valueOf(field); 675 if (molType != null) 676 { 677 bioSequencePlus.setMolType(MolType.valueOf(field)); 678 molTypeFound = true; 679 continue; 680 } 681 } 682 683 if (! dataClassFound) 684 { 685 EMBL_DataClass dataClass = EMBL_DataClass.valueOf(field); 686 if (dataClass != null) 687 { 688 mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, dataClass); 689 dataClassFound = true; 690 continue; 691 } 692 } 693 694 if (! 
divisionFound) 695 { 696 EMBL_TaxonDivision div = EMBL_TaxonDivision.valueOf(field); 697 if (div != null) 698 { 699 bioSequencePlus.setSeqRepositoryDivision(div); 700 divisionFound = true; 701 continue; 702 } 703 } 704 705 if (pieceIndex == pieces.length - 1) 706 { 707 Matcher seqLengthMatcher = sSeqLengthPattern.matcher(field); 708 if (seqLengthMatcher.matches()) 709 { 710 mSeqLengthFromIdLine = Integer.parseInt(seqLengthMatcher.group(1)); 711 if (seqLengthMatcher.group(2).equalsIgnoreCase("BP")) 712 { 713 mFormatVariant = FormatVariant.EMBL; 714 } 715 else 716 { 717 mFormatVariant = FormatVariant.UniProt; 718 } 719 } 720 } 721 } 722 } 723 } 724 } 725 } 726 727 //--------------------------------------------------------------------------- 728 private void parseAC(String inLine) 729 { 730 String[] pieces = inLine.substring(2).split(";\\s*"); 731 732 if (null == mCurrentSeq.getID()) 733 { 734 mCurrentSeq.setID(pieces[0].trim()); 735 } 736 737 // TODO: handle additional ids 738 } 739 740 //--------------------------------------------------------------------------- 741 private void parseDE(String inLine) 742 { 743 String description = inLine.substring(2).trim(); 744 if (StringUtil.isSet(mCurrentSeq.getDescription())) 745 { 746 description = mCurrentSeq.getDescription() + " " + description; 747 } 748 749 mCurrentSeq.setDescription(description); 750 } 751 752 //--------------------------------------------------------------------------- 753 // Parse keywords 754 private void parseKW(String inLine) 755 { 756 String keywordString = inLine.substring(2).trim(); 757 758 if (mCurrentSeq instanceof BioSequencePlus) 759 { 760 mCurrentKeywords += keywordString; 761 762 if (mCurrentKeywords.endsWith(".")) 763 { 764 String[] keywords = mCurrentKeywords.split("[;\\.]"); 765 for (String keyword : keywords) 766 { 767 if (StringUtil.isSet(keyword)) 768 { 769 ((BioSequencePlus) mCurrentSeq).addKeyword(keyword.trim()); 770 } 771 } 772 } 773 } 774 } 775 776 
//--------------------------------------------------------------------------- 777 // Parse the reference authors 778 private void parseRA(String inLine) 779 { 780 String[] authors = null; 781 if (mFormatVariant.equals(FormatVariant.EMBL)) 782 { 783 authors = inLine.substring(2).split(";\\s*"); 784 } 785 else if (mFormatVariant.equals(FormatVariant.UniProt)) 786 { 787 authors = inLine.substring(2).split(",\\s*"); 788 } 789 790 if (authors != null) 791 { 792 for (String author : authors) 793 { 794 if (StringUtil.isSet(author)) 795 { 796 mCurrentCitation.addAuthor(new Author(author.trim())); 797 } 798 } 799 } 800 } 801 802 //--------------------------------------------------------------------------- 803 // Parse the reference title 804 private void parseRT(String inLine) 805 { 806 String title = inLine.substring(2).trim(); 807 if (title.endsWith(";")) 808 { 809 title = title.substring(0, title.length() - 1); 810 } 811 812 if (StringUtil.isSet(title)) 813 { 814 if (mCurrentCitation.getTitle() != null) 815 { 816 title = mCurrentCitation.getTitle() + " " + title; 817 } 818 819 if (StringUtil.isQuoted(title)) 820 { 821 title = StringUtil.unquote(title); 822 } 823 824 mCurrentCitation.setTitle(title); 825 } 826 } 827 828 //--------------------------------------------------------------------------- 829 private void parseRX(String inLine) 830 { 831 String xref = inLine.substring(2).trim(); 832 if (xref.endsWith(".")) 833 { 834 xref = xref.substring(0, xref.length() - 1); 835 } 836 837 String[] pieces = xref.split(";\\s*"); 838 839 if (pieces[0].equals("PUBMED")) 840 { 841 mCurrentCitation.setPubMedId(pieces[1]); 842 } 843 844 // TODO: handle other x-refs 845 } 846 847 //--------------------------------------------------------------------------- 848 // Parse reference position value 849 private void parseRP(String inLine) 850 { 851 String positionString = inLine.substring(2).trim(); 852 if (StringUtil.isSet(positionString)) 853 { 854 String[] positionStrings = 
positionString.split("\\s*-\\s*"); 855 if (2 == positionStrings.length) 856 { 857 mCurrentCitation.setSeqLocation(new SeqLocation().setStart(Integer.parseInt(positionStrings[0].trim())).setEnd(Integer.parseInt(positionStrings[1].trim()))); 858 } 859 } 860 } 861 862 //--------------------------------------------------------------------------- 863 // Parse reference location 864 private void parseRL(String inLine) 865 { 866 String stringValue = inLine.substring(2).trim(); 867 if (StringUtil.isSet(stringValue)) 868 { 869 mCurrentCitation.appendRawContent(stringValue); 870 } 871 } 872 873 //--------------------------------------------------------------------------- 874 private void parseOS(String inLine) 875 { 876 String text = inLine.substring(2).trim(); 877 878 if (null == mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR)) 879 { 880 mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, text); 881 } 882 else 883 { 884 mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR) + " " + text); 885 } 886 } 887 888 //--------------------------------------------------------------------------- 889 private void parseOC(String inLine) 890 { 891 String text = inLine.substring(2).trim(); 892 893 if (null == mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR)) 894 { 895 mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, text); 896 } 897 else 898 { 899 mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR) + " " + text); 900 } 901 } 902 903 904 //--------------------------------------------------------------------------- 905 // Parse organism taxonomy cross-reference 906 // OX Taxonomy_database_Qualifier=Taxonomic code; 907 // Example: 908 // OX NCBI_TaxID=9606; 909 private void parseOX(String inLine) 910 { 911 String[] pieces = inLine.substring(2).trim().split("="); 912 913 // Remove trailing ';' 914 if (pieces[1].endsWith(";")) 915 { 916 pieces[1] = 
pieces[1].substring(0, pieces[1].length() - 1); 917 } 918 919 mCurrentSeq.setAttribute(ORGANISM_NCBI_TAXON_ID_ATTR, Integer.parseInt(pieces[1])); 920 if (mCurrentSeq instanceof BioSequencePlus) 921 { 922 NCBITaxon taxon = NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])); 923 if (taxon != null) 924 { 925 ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon); 926 } 927 } 928 } 929 930 931 //--------------------------------------------------------------------------- 932 // Parse database cross-reference 933 // The format of the DR line is: 934 // DR RESOURCE_ABBREVIATION; RESOURCE_IDENTIFIER; OPTIONAL_INFORMATION_1[; OPTIONAL_INFORMATION_2][; OPTIONAL_INFORMATION_3]. 935 // Example: 936 // DR EMBL; U29082; AAA68403.1; -; Genomic_DNA. 937 private void parseDR(String inLine) 938 { 939 String[] pieces = inLine.substring(2).trim().split(";\\s*"); 940 941 // Remove trailing '.' from the last piece 942 if (pieces[pieces.length - 1].endsWith(".")) 943 { 944 pieces[pieces.length - 1] = pieces[pieces.length - 1].substring(0, pieces[pieces.length - 1].length() - 1); 945 } 946 947 DbXref xref = new DbXref(pieces[0], pieces[1]); 948 if (pieces.length > 2) 949 { 950 StringBuilderPlus description = new StringBuilderPlus().setDelimiter("; "); 951 for (int i = 2; i < pieces.length; i++) 952 { 953 description.delimitedAppend(pieces[i]); 954 } 955 xref.setDescription(description.toString()); 956 } 957 958 if (mCurrentSeq instanceof BioSequencePlus) 959 { 960 ((BioSequencePlus) mCurrentSeq).addDbXref(xref); 961 } 962 } 963 964 965 966 //--------------------------------------------------------------------------- 967 // Parse feature 968 private void parseFT(String inLine) 969 { 970 if (mFormatVariant.equals(FormatVariant.EMBL)) 971 { 972 pareGenBankFeatureTableLine(inLine); 973 } 974 else if (mFormatVariant.equals(FormatVariant.UniProt)) 975 { 976 pareUniProtFeatureTableLine(inLine); 977 } 978 } 979 980 private static final Pattern sGenBankFeatureQualifierPattern = 
Pattern.compile("/(\\S+?)(?:=(.+))?"); 981 982 //--------------------------------------------------------------------------- 983 private void pareGenBankFeatureTableLine(String inLine) 984 { 985 // Is there a feature key on this line? 986 String featureKeyString = inLine.substring(5, 20).trim(); 987 if (StringUtil.isSet(featureKeyString)) 988 { 989 GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString); 990 if (null == featureKey) 991 { 992 throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!"); 993 } 994 995 String locationString = inLine.substring(21).trim(); 996 mCurrentGenBankFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString)); 997 if (mCurrentSeq instanceof BioSequencePlus) 998 { 999 ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentGenBankFeature); 1000 } 1001 1002 // Unquote the previous qualifier if necessary 1003 if (mCurrentGenBankFeatureQualifier != null 1004 && mCurrentGenBankFeatureQualifier.getValue().startsWith("\"")) 1005 { 1006 mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue())); 1007 } 1008 mCurrentGenBankFeatureQualifier = null; 1009 } 1010 else 1011 { 1012 String content = inLine.substring(21).trim(); 1013 1014 Matcher m = sGenBankFeatureQualifierPattern.matcher(content); 1015 if (m.matches()) 1016 { 1017 // New qualifier 1018 1019 // Unquote the previous qualifier if necessary 1020 if (mCurrentGenBankFeatureQualifier != null 1021 && mCurrentGenBankFeatureQualifier.getValue().startsWith("\"")) 1022 { 1023 mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue())); 1024 } 1025 1026 GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1)); 1027 if (null == qualifierName) 1028 { 1029 throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); 1030 } 1031 1032 
mCurrentGenBankFeatureQualifier = new GenBankFeatureQualifier(qualifierName); 1033 mCurrentGenBankFeature.addQualifier(mCurrentGenBankFeatureQualifier); 1034 1035 String value = m.group(2); 1036 if (value != null) 1037 { 1038 mCurrentGenBankFeatureQualifier.appendToValue(value); 1039 } 1040 } 1041 else if (mCurrentGenBankFeatureQualifier != null) 1042 { 1043 // Continuation of a previous qualifier 1044 mCurrentGenBankFeatureQualifier.appendToValue(content); 1045 } 1046 else if (mCurrentGenBankFeature != null) 1047 { 1048 // Continuation of a feature location 1049 mCurrentGenBankFeature.getLocation().append(content); 1050 } 1051 } 1052 } 1053 1054 1055 // Example UniProt FT entry: 1056 // FT CHAIN 1 312 Methionyl-tRNA formyltransferase. 1057 // FT /FTId=PRO_1000077286. 1058 1059 private static final Pattern sUniProt_FT_FirstLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{1,5}(\\w+)\\s{1,10}([\\?\\<]?\\d*)\\s{1,10}([\\?\\>]?\\d*)\\s+(.+)"); 1060 private static final Pattern sUniProt_FT_AdditionalLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{20,}(.+)"); 1061 1062 //--------------------------------------------------------------------------- 1063 private void pareUniProtFeatureTableLine(String inLine) 1064 { 1065 Matcher m = sUniProt_FT_FirstLinePattern.matcher(inLine); 1066 if (m.matches()) 1067 { 1068 UniProtFeatureKey featureKey = UniProtFeatureKey.valueOf(m.group(1)); 1069 UniProtFeatureLocation location = new UniProtFeatureLocation(m.group(2), m.group(3)); 1070 mCurrentUniProtFeature = new UniProtFeature(featureKey, location).setDescription(m.group(4)); 1071 if (mCurrentSeq instanceof BioSequencePlus) 1072 { 1073 ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentUniProtFeature); 1074 } 1075 } 1076 else 1077 { 1078 m = sUniProt_FT_AdditionalLinePattern.matcher(inLine); 1079 if (m.matches()) 1080 { 1081 if (m.group(1).startsWith("/")) 1082 { 1083 String[] pieces = m.group(1).substring(1).split("="); 1084 if (2 != pieces.length) 1085 { 1086 throw new 
SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!"); 1087 } 1088 1089 UniProtFeatureQualifierName qualifierName = UniProtFeatureQualifierName.valueOf(pieces[0]); 1090 if (null == qualifierName) 1091 { 1092 throw new SeqFormatException(StringUtil.singleQuote(pieces[0]) + " is not a recognized qualifier name!"); 1093 } 1094 1095 mCurrentUniProtFeatureQualifier = new UniProtFeatureQualifier(qualifierName); 1096 mCurrentUniProtFeature.addQualifier(mCurrentUniProtFeatureQualifier); 1097 1098 // Trim trailing period 1099 if (pieces[1].endsWith(".")) 1100 { 1101 pieces[1] = pieces[1].substring(0, pieces[1].length() - 1); 1102 } 1103 mCurrentUniProtFeatureQualifier.appendToValue(pieces[1]); 1104 } 1105 else 1106 { 1107 mCurrentUniProtFeature.appendDescription(m.group(1)); 1108 } 1109 } 1110 } 1111 } 1112 1113 //########################################################################### 1114 // INNER CLASS 1115 //########################################################################### 1116 1117 class SeqFilterReader extends LettersOnlyReader 1118 { 1119 //--------------------------------------------------------------------------- 1120 public SeqFilterReader(Reader inReader) 1121 { 1122 super(inReader); 1123 } 1124 1125 //--------------------------------------------------------------------------- 1126 @Override 1127 public int read() 1128 throws IOException 1129 { 1130 int returnChar; 1131 1132 do 1133 { 1134 returnChar = innerRead(); 1135 } 1136 while (returnChar >= 0 1137 && (Character.isWhitespace(returnChar) 1138 || Character.isDigit(returnChar) 1139 || returnChar == '/')); 1140 1141 return returnChar; 1142 } 1143 } 1144 1145}