001package com.hfg.bio.seq.format;
002
003import java.io.BufferedReader;
004import java.io.IOException;
005import java.io.Reader;
006import java.util.List;
007import java.util.logging.Level;
008import java.util.logging.Logger;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import com.hfg.bio.DbXref;
013import com.hfg.bio.seq.BioSequence;
014import com.hfg.bio.seq.BioSequenceFactory;
015import com.hfg.bio.seq.BioSequencePlus;
016import com.hfg.bio.seq.Clone;
017import com.hfg.bio.seq.SeqLocation;
018import com.hfg.bio.seq.SeqTopology;
019import com.hfg.bio.seq.format.feature.FeatureQualifier;
020import com.hfg.bio.seq.format.feature.SeqFeature;
021import com.hfg.bio.seq.format.feature.genbank.GenBankFeature;
022import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureKey;
023import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureLocation;
024import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifier;
025import com.hfg.bio.seq.format.feature.genbank.GenBankFeatureQualifierName;
026import com.hfg.bio.seq.format.feature.qualifier.MolType;
027import com.hfg.bio.seq.format.feature.uniprot.UniProtFeature;
028import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureKey;
029import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureLocation;
030import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifier;
031import com.hfg.bio.seq.format.feature.uniprot.UniProtFeatureQualifierName;
032import com.hfg.bio.taxonomy.uniprot.EMBL_TaxonDivision;
033import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
034import com.hfg.citation.Author;
035import com.hfg.util.StringBuilderPlus;
036import com.hfg.util.StringUtil;
037import com.hfg.util.collection.CollectionUtil;
038import com.hfg.util.io.LettersOnlyReader;
039
040//------------------------------------------------------------------------------
041/**
042 EMBL sequence format.
043 <div>
044 See <a href='ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt'>ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt</a>
045 </div>
046 <div>
047 See <a href='http://web.expasy.org/docs/userman.html'>http://web.expasy.org/docs/userman.html</a> for info on the Uniprot format variant.
048 </div>
049
050 <div>
051 @author J. Alex Taylor, hairyfatguy.com
052 </div>
053 */
054//------------------------------------------------------------------------------
055// com.hfg Library
056//
057// This library is free software; you can redistribute it and/or
058// modify it under the terms of the GNU Lesser General Public
059// License as published by the Free Software Foundation; either
060// version 2.1 of the License, or (at your option) any later version.
061//
062// This library is distributed in the hope that it will be useful,
063// but WITHOUT ANY WARRANTY; without even the implied warranty of
064// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
065// Lesser General Public License for more details.
066//
067// You should have received a copy of the GNU Lesser General Public
068// License along with this library; if not, write to the Free Software
069// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
070//
071// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
072// jataylor@hairyfatguy.com
073//------------------------------------------------------------------------------
074
075/*
076Example record from ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt :
077
078ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
079XX
080AC   X56734; S46826;
081XX
082DT   12-SEP-1991 (Rel. 29, Created)
083DT   25-NOV-2005 (Rel. 85, Last updated, Version 11)
084XX
085DE   Trifolium repens mRNA for non-cyanogenic beta-glucosidase
086XX
087KW   beta-glucosidase.
088XX
089OS   Trifolium repens (white clover)
090OC   Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
091OC   Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
092OC   fabids; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
093XX
094RN   [5]
095RP   1-1859
096RX   DOI; 10.1007/BF00039495.
097RX   PUBMED; 1907511.
098RA   Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
099RT   "Nucleotide and derived amino acid sequence of the cyanogenic
100RT   beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
101RL   Plant Mol. Biol. 17(2):209-219(1991).
102XX
103RN   [6]
104RP   1-1859
105RA   Hughes M.A.;
106RT   ;
107RL   Submitted (19-NOV-1990) to the INSDC.
108RL   Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
109RL   Upon Tyne, NE2 4HH, UK
110XX
111DR   EuropePMC; PMC99098; 11752244.
112XX
113FH   Key             Location/Qualifiers
114FH
115FT   source          1..1859
116FT                   /organism="Trifolium repens"
117FT                   /mol_type="mRNA"
118FT                   /clone_lib="lambda gt10"
119FT                   /clone="TRE361"
120FT                   /tissue_type="leaves"
121FT                   /db_xref="taxon:3899"
122FT   mRNA            1..1859
123FT                   /experiment="experimental evidence, no additional details
124FT                   recorded"
125FT   CDS             14..1495
126FT                   /product="beta-glucosidase"
127FT                   /EC_number="3.2.1.21"
128FT                   /note="non-cyanogenic"
129FT                   /db_xref="GOA:P26204"
130FT                   /db_xref="InterPro:IPR001360"
131FT                   /db_xref="InterPro:IPR013781"
132FT                   /db_xref="InterPro:IPR017853"
133FT                   /db_xref="InterPro:IPR018120"
134FT                   /db_xref="UniProtKB/Swiss-Prot:P26204"
135FT                   /protein_id="CAA40058.1"
136FT                   /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
137FT                   FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
138FT                   DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
139FT                   VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
140FT                   CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
141FT                   DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
142FT                   IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
143FT                   EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
144FT                   IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
145XX
146SQ   Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
147     aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt        60
148     cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag       120
149     tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga       180
150     aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata       240
151     tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta       300
152     caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc       360
153     ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa       420
154     atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct       480
155     ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg       540
156     tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt       600
157     gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg       660
158     aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac       720
159     aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta       780
160     taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg       840
161     gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga       900
162     cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg       960
163     gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg      1020
164     ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc      1080
165     acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa      1140
166     acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat      1200
167     gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct      1260
168     gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga      1320
169     agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg      1380
170     ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg      1440
171     taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga      1500
172     tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa      1560
173     ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt      1620
174     tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg      1680
175     aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc      1740
176     agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac      1800
177     tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa       1859
178//
179                  Figure 1 - A sample entry from the database
180*/
181public class EMBL<T extends BioSequence> extends ReadableSeqFormatBase<T>
182{
183
   // Variables used during parsing.
   // These are mutable instance fields that are written while a record is being read.
   // NOTE(review): that presumably means one EMBL instance should not be used by several
   // threads at once - confirm against the callers.

   // The sequence object being populated for the record currently being read.
   private T mCurrentSeq;
   // Flat-file flavor of the current record; assigned by parseID().
   private FormatVariant mFormatVariant;
   // Two-character line code of the line most recently handed to parseLine().
   private String mCurrentLineCode;
   // Feature-table parsing state (EMBL/GenBank-style features).
   private GenBankFeature mCurrentGenBankFeature;
   // Last qualifier seen in the feature table; readRecord() unquotes its value at the end of the record.
   private GenBankFeatureQualifier mCurrentGenBankFeatureQualifier;
   // Feature-table parsing state (UniProt-style features).
   private UniProtFeature mCurrentUniProtFeature;
   private UniProtFeatureQualifier mCurrentUniProtFeatureQualifier;
   // Reference currently being filled in; created when an RN line is encountered.
   private SeqCitation mCurrentCitation;
   // Sequence length as declared on the ID line (null if it was not found).
   private Integer mSeqLengthFromIdLine;

   // Text of the KW line(s) accumulated so far for the current record.
   private String mCurrentKeywords;
196
   // Valid line codes.
   // Every line of a record starts with one of these two-character codes. The counts in
   // parentheses are the per-entry occurrence rules quoted from the EMBL user manual.
   public static final String AC_LINE_CODE = "AC"; // AC - accession number           (>=1 per entry)
   public static final String AH_LINE_CODE = "AH"; // AH - assembly header            (0 or 1 per entry)
   public static final String AS_LINE_CODE = "AS"; // AS - assembly information       (0 or >=1 per entry)
   public static final String CC_LINE_CODE = "CC"; // CC - comments or notes          (>=0 per entry)
   public static final String CO_LINE_CODE = "CO"; // CO - contig/construct line      (0 or >=1 per entry)
   public static final String DE_LINE_CODE = "DE"; // DE - description                (>=1 per entry)
   public static final String DR_LINE_CODE = "DR"; // DR - database cross-reference   (>=0 per entry)
   public static final String DT_LINE_CODE = "DT"; // DT - date                       (2 per entry)
   public static final String FH_LINE_CODE = "FH"; // FH - feature table header       (2 per entry)
   public static final String FT_LINE_CODE = "FT"; // FT - feature table data         (>=2 per entry)
   public static final String ID_LINE_CODE = "ID"; // ID - identification             (begins each entry; 1 per entry)
   public static final String KW_LINE_CODE = "KW"; // KW - keyword                    (>=1 per entry)
   public static final String OC_LINE_CODE = "OC"; // OC - organism classification    (>=1 per entry)
   public static final String OG_LINE_CODE = "OG"; // OG - organelle                  (0 or 1 per entry)
   public static final String OS_LINE_CODE = "OS"; // OS - organism species           (>=1 per entry)
   public static final String PR_LINE_CODE = "PR"; // PR - project identifier         (0 or 1 per entry)
   public static final String RA_LINE_CODE = "RA"; // RA - reference author(s)        (>=0 per entry)
   public static final String RC_LINE_CODE = "RC"; // RC - reference comment          (>=0 per entry)
   public static final String RG_LINE_CODE = "RG"; // RG - reference group            (>=0 per entry)
   public static final String RL_LINE_CODE = "RL"; // RL - reference location         (>=1 per entry)
   public static final String RN_LINE_CODE = "RN"; // RN - reference number           (>=1 per entry)
   public static final String RP_LINE_CODE = "RP"; // RP - reference positions        (>=1 per entry)
   public static final String RT_LINE_CODE = "RT"; // RT - reference title            (>=1 per entry)
   public static final String RX_LINE_CODE = "RX"; // RX - reference cross-reference  (>=0 per entry)
   public static final String SQ_LINE_CODE = "SQ"; // SQ - sequence header            (1 per entry)
   // EMBL-specific line codes
   public static final String XX_LINE_CODE = "XX"; // XX - spacer line                (many per entry)
   // UniProt-specific line codes
   public static final String GN_LINE_CODE = "GN"; // GN - presumably gene name(s) - TODO confirm against the UniProt user manual
   public static final String OH_LINE_CODE = "OH"; // Organism host taxonomy cross-reference
   public static final String OX_LINE_CODE = "OX"; // Organism taxonomy cross-reference
   public static final String PE_LINE_CODE = "PE"; // PE - presumably protein existence evidence - TODO confirm against the UniProt user manual
230
   // Attributes populated into the sequence object.
   // These are the attribute names passed to setAttribute() on the sequence being built.
   public static final String CLONE_ATTR       = "Clone";  // Value is a Clone built from the source feature's clone / sub_clone qualifiers
   public static final String EMBL_DATA_CLASS_ATTR = "EMBL Data Class";  // Value is the EMBL_DataClass read from the ID line
   // NOTE(review): the three organism attributes are presumably set by the OS / OC / OX line parsers - confirm.
   public static final String SPECIES_SCIENTIFIC_NAME_ATTR = "Species Scientific Name";
   public static final String ORGANISM_CLASSIFICATION_ATTR = "Organism Classification";
   public static final String ORGANISM_NCBI_TAXON_ID_ATTR  = "Organism NCBI Taxon ID";
237
238
   /**
    The flavor of the flat-file format used by the record being parsed.
    The value is determined from the ID line in parseID() and is consulted
    when deciding whether UniProt-only line codes are acceptable.
    */
   enum FormatVariant
   {
      // Nucleotide record: ID line with 'SV' and a length given in BP
      EMBL,
      // Protein record: ID line with a review status and a length given in AA
      UniProt
   }
244
245   private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName());
246
247   static
248   {
249      LOGGER.setLevel(Level.WARNING);
250      LOGGER.setUseParentHandlers(true);
251   }
252
253   //###########################################################################
254   // CONSTRUCTORS
255   //###########################################################################
256
257   //---------------------------------------------------------------------------
   /**
    Creates an EMBL format reader.
    @param inSeqFactory the factory handed to the superclass; readRecord() uses it
                        (via getBioSequenceFactory()) to create the sequence object for each record
    */
   public EMBL(BioSequenceFactory<T> inSeqFactory)
   {
      super(inSeqFactory);
   }
262
263   //###########################################################################
264   // PUBLIC METHODS
265   //###########################################################################
266
267   //---------------------------------------------------------------------------
   /**
    Returns the static Logger instance held by this class so that callers can,
    for example, adjust its level or handlers.
    @return the Logger stored in the LOGGER constant
    */
   public static Logger getLogger()
   {
      return LOGGER;
   }
272
273   //---------------------------------------------------------------------------
274   public boolean isEndOfRecord(String inLine)
275   {
276      return inLine.trim().equals("//");
277   }
278
279   //---------------------------------------------------------------------------
   /**
    Always returns false for this format.
    NOTE(review): presumably a "Janus" delimiter is one that both ends a record and starts
    the next; here records end with a dedicated "//" line - confirm against the base class contract.
    @return false
    */
   public boolean hasJanusDelimiter()
   {
      return false;
   }
284
285   //---------------------------------------------------------------------------
286   public T readRecord(BufferedReader inReader)
287         throws SeqIOException
288   {
289      initRecordParsing();
290
291      int lineCount = 0;
292      int maxPreIdLines = 50;
293      boolean idLineFound = false;
294
295      try
296      {
297         mCurrentSeq = getBioSequenceFactory().createSeqObj();
298
299         String line;
300         while ((line = inReader.readLine()) != null)
301         {
302            lineCount++;
303            
304            if (! idLineFound)
305            {
306               if (lineCount > maxPreIdLines)
307               {
308                  throw new SeqFormatException("No EMBL ID line found within " + maxPreIdLines + " lines of the start!");
309               }
310
311               if (line.length() > 2)
312               {
313                  String lineCode = line.substring(0, 2);
314                  if (ID_LINE_CODE.equals(lineCode))
315                  {
316                     idLineFound = true;
317                  }
318                  else
319                  {
320                     continue;
321                  }
322               }
323            }
324            else if (isEndOfRecord(line))
325            {
326               break;
327            }
328
329            if (idLineFound
330                && StringUtil.isSet(line))
331            {
332               parseLine(line);
333
334               if (SQ_LINE_CODE.equals(mCurrentLineCode))
335               {
336                  break;
337               }
338            }
339         }
340
341         if (! idLineFound)
342         {
343            throw new SeqFormatException("No EMBL ID line found!");
344         }
345
346         // The rest of the record is assumed to be sequence
347
348         // Cleanup the sequence to remove spaces and numbers
349         Reader filterReader = new SeqFilterReader(inReader);
350         mCurrentSeq.setSequence(filterReader);
351
352         filterReader.close();
353
354         // Cleanup
355
356         if (mCurrentGenBankFeatureQualifier != null)
357         {
358            // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES
359            if (mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
360            {
361               mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
362            }
363            mCurrentGenBankFeatureQualifier = null;
364
365            if (mCurrentSeq instanceof BioSequencePlus)
366            {
367               List<SeqFeature> sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
368               if (CollectionUtil.hasValues(sourceFeatures))
369               {
370                  SeqFeature source = sourceFeatures.get(0);
371                  List<FeatureQualifier> cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
372                  if (CollectionUtil.hasValues(cloneQualifiers))
373                  {
374                     Clone clone = new Clone(cloneQualifiers.get(0).getValue());
375
376                     List<FeatureQualifier> subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
377                     if (CollectionUtil.hasValues(subcloneQualifiers))
378                     {
379                        clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
380                     }
381
382                     mCurrentSeq.setAttribute(CLONE_ATTR, clone);
383                  }
384               }
385            }
386         }
387
388         // TODO: Clean the '.' off the end of feature descriptions
389      }
390      catch (SeqFormatException e)
391      {
392         throw new SeqFormatException("Problem parsing EMBL record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
393      }
394      catch (Exception e)
395      {
396         throw new SeqIOException("Problem parsing EMBL record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
397      }
398
399      if (! idLineFound)
400      {
401         throw new SeqFormatException("No " + ID_LINE_CODE + " line detected in the EMBL record!");
402      }
403
404      return mCurrentSeq;
405   }
406
407   //###########################################################################
408   // PRIVATE METHODS
409   //###########################################################################
410
411   //---------------------------------------------------------------------------
412   private void initRecordParsing()
413   {
414      mCurrentSeq      = null;
415      mCurrentLineCode = null;
416      mCurrentUniProtFeature = null;
417      mCurrentCitation = null;
418      mCurrentKeywords = "";
419   }
420
421   //---------------------------------------------------------------------------
422   private void parseLine(String inLine)
423   {
424      mCurrentLineCode = inLine.substring(0, 2);
425      switch (mCurrentLineCode)
426      {
427         case XX_LINE_CODE:
428            // Blank line
429            break;
430         case CC_LINE_CODE:
431            // Comment line
432            break;
433         case ID_LINE_CODE:
434            parseID(inLine);
435            break;
436         case AC_LINE_CODE:
437            // Accessions
438            parseAC(inLine);
439            break;
440         case DT_LINE_CODE:
441            // Date
442 //TODO:           parseDT(inLine);
443            break;
444         case DE_LINE_CODE:
445            // Description
446            parseDE(inLine);
447            break;
448         case KW_LINE_CODE:
449            // Keywords
450            parseKW(inLine);
451            break;
452         case OS_LINE_CODE:
453            // Organism species
454            parseOS(inLine);
455            break;
456         case OC_LINE_CODE:
457            // Organism classification
458            parseOC(inLine);
459            break;
460         case RN_LINE_CODE:
461            // Reference number (start of a new reference)
462            mCurrentCitation = new SeqCitation();
463            if (mCurrentSeq instanceof BioSequencePlus)
464            {
465               ((BioSequencePlus) mCurrentSeq).addReference(mCurrentCitation);
466            }
467            break;
468         case RA_LINE_CODE:
469            // Reference author(s)
470            parseRA(inLine);
471            break;
472         case RT_LINE_CODE:
473            // Reference title
474            parseRT(inLine);
475            break;
476         case RX_LINE_CODE:
477            // Reference cross-reference
478            parseRX(inLine);
479            break;
480         case RP_LINE_CODE:
481            // Reference positions
482            parseRP(inLine);
483            break;
484         case RL_LINE_CODE:
485            // Reference location
486            parseRL(inLine);
487            break;
488         case RC_LINE_CODE:
489            // Reference comment
490            //TODO
491            break;
492         case DR_LINE_CODE:
493            // Database cross-reference
494            parseDR(inLine);
495            break;
496         case AH_LINE_CODE:
497         case AS_LINE_CODE:
498            // Assembly info
499            // TODO
500            break;
501         case FH_LINE_CODE:
502            // Feature table header. Ignore
503            break;
504         case FT_LINE_CODE:
505            // Features
506            parseFT(inLine);
507            break;
508         case SQ_LINE_CODE:
509            // Sequence data
510            // TODO
511            break;
512         case GN_LINE_CODE:
513            if (mFormatVariant != FormatVariant.UniProt)
514            {
515               throw new SeqFormatException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
516            }
517            break;
518         case OX_LINE_CODE:
519            if (mFormatVariant != FormatVariant.UniProt)
520            {
521               throw new SeqFormatException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
522            }
523            parseOX(inLine);
524            break;
525         case PE_LINE_CODE:
526            if (mFormatVariant != FormatVariant.UniProt)
527            {
528               throw new SeqFormatException("Line code " + StringUtil.quote(mCurrentLineCode) + " should not be present in this EMBL format variant!");
529            }
530            break;
531         default:
532            throw new SeqFormatException("Unrecognized line code: " + StringUtil.quote(mCurrentLineCode) + "!");
533      }
534/*
535
536      GenBankKeyword keyword = getLineKeyword(inLine);
537      if (keyword != null)
538      {
539         // Found the start of a new keyword field
540         finishPreviousKeyword();
541
542         mCurrentLineCode = keyword;
543         mCurrentSubkeyword = null;
544
545         parseField(inLine);
546      }
547      else
548      {
549         // Continuation of an existing field
550         if (GenBankKeyword.FEATURES.equals(mCurrentLineCode))
551         {
552            // Features have a special set of feature keys
553            parseFeatures(inLine);
554         }
555         else
556         {
557            mCurrentSubkeyword = getLineSubkeyword(inLine);
558            if (mCurrentSubkeyword != null)
559            {
560               // Start of a new subfield
561            }
562            else
563            {
564               // Continuation of an existing subfield
565            }
566
567            parseField(inLine);
568         }
569      }
570*/
571   }
572
573   //---------------------------------------------------------------------------
574   // Parse the ID line
575   // Ex #1: ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
576   // Ex #2: ID   FMT_ANADE               Reviewed;         312 AA.
577   //
578   // The ID (IDentification) line is always the first line of an entry. The
579   // format of the ID line is:
580   //   ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
581   //   The tokens represent:
582   //   1. Primary accession number
583   //   2. Sequence version number
584   //   3. Topology: 'circular' or 'linear'
585   //   4. Molecule type (see note 1 below)
586   //   5. Data class (see section 3.1)
587   //   6. Taxonomic division (see section 3.2)
588   //   7. Sequence length (see note 2 below)
589   private static final Pattern sEMBL_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+);\\s+SV\\s+(\\w+);\\s+(\\w+);\\s+(.+?);\\s+(\\w+);\\s+(\\w+);\\s+(\\w+) BP.");
590   private static final Pattern sUniProt_ID_Pattern1 = Pattern.compile(ID_LINE_CODE + "\\s+(\\w+)\\s+(?:Reviewed|Unreviewd);\\s+(\\w+) AA.");
591   private static final Pattern sSeqLengthPattern = Pattern.compile("(\\w+) (AA|BP).");
592
593   private void parseID(String inLine)
594   {
595      Matcher m = sEMBL_ID_Pattern1.matcher(inLine);
596      if (m.matches())
597      {
598         mFormatVariant = FormatVariant.EMBL;
599
600         if (StringUtil.isSet(m.group(1)))
601         {
602            mCurrentSeq.setID(m.group(1) + '.' + m.group(2));
603         }
604
605         if (mCurrentSeq instanceof BioSequencePlus)
606         {
607            if (m.group(3) != null)
608            {
609               ((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(3)));
610            }
611
612            if (m.group(4) != null)
613            {
614               ((BioSequencePlus) mCurrentSeq).setMolType(MolType.retrieveOrCreateValueOf(m.group(4)));
615            }
616
617            mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, EMBL_DataClass.valueOf(m.group(5)));
618
619            ((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(EMBL_TaxonDivision.valueOf(m.group(6)));
620         }
621
622         mSeqLengthFromIdLine = Integer.parseInt(m.group(7));
623      }
624      else
625      {
626         m = sUniProt_ID_Pattern1.matcher(inLine);
627         if (m.matches())
628         {
629            mFormatVariant = FormatVariant.UniProt;
630         }
631         else
632         {
633            // Non-standard ID line format. Do the best we can.
634            String[] pieces = inLine.substring(2).split(";");
635
636            int pieceIndex = 1;
637
638            String id = pieces[0].trim().split("\\s+")[0].trim();
639            if (pieces.length > 1
640                  && pieces[1].trim().startsWith("SV"))
641            {
642               id += "." + pieces[1].trim().substring(2).trim();
643               pieceIndex++;
644            }
645
646            mCurrentSeq.setID(id);
647
648            if (mCurrentSeq instanceof BioSequencePlus)
649            {
650               BioSequencePlus bioSequencePlus = (BioSequencePlus) mCurrentSeq;
651
652               boolean topologyFound = false;
653               boolean molTypeFound  = false;
654               boolean dataClassFound  = false;
655               boolean divisionFound  = false;
656
657               for (; pieceIndex < pieces.length; pieceIndex++)
658               {
659                  String field = pieces[pieceIndex].trim();
660
661                  if (! topologyFound)
662                  {
663                     SeqTopology seqTopology = SeqTopology.valueOf(field);
664                     if (seqTopology != null)
665                     {
666                        bioSequencePlus.setSeqTopology(seqTopology);
667                        topologyFound = true;
668                        continue;
669                     }
670                  }
671
672                  if (! molTypeFound)
673                  {
674                     MolType molType = MolType.valueOf(field);
675                     if (molType != null)
676                     {
677                        bioSequencePlus.setMolType(MolType.valueOf(field));
678                        molTypeFound = true;
679                        continue;
680                     }
681                  }
682
683                  if (! dataClassFound)
684                  {
685                     EMBL_DataClass dataClass = EMBL_DataClass.valueOf(field);
686                     if (dataClass != null)
687                     {
688                        mCurrentSeq.setAttribute(EMBL_DATA_CLASS_ATTR, dataClass);
689                        dataClassFound = true;
690                        continue;
691                     }
692                  }
693
694                  if (! divisionFound)
695                  {
696                     EMBL_TaxonDivision div = EMBL_TaxonDivision.valueOf(field);
697                     if (div != null)
698                     {
699                        bioSequencePlus.setSeqRepositoryDivision(div);
700                        divisionFound = true;
701                        continue;
702                     }
703                  }
704
705                  if (pieceIndex == pieces.length - 1)
706                  {
707                     Matcher seqLengthMatcher = sSeqLengthPattern.matcher(field);
708                     if (seqLengthMatcher.matches())
709                     {
710                        mSeqLengthFromIdLine = Integer.parseInt(seqLengthMatcher.group(1));
711                        if (seqLengthMatcher.group(2).equalsIgnoreCase("BP"))
712                        {
713                           mFormatVariant = FormatVariant.EMBL;
714                        }
715                        else
716                        {
717                           mFormatVariant = FormatVariant.UniProt;
718                        }
719                     }
720                  }
721               }
722            }
723         }
724      }
725   }
726
727   //---------------------------------------------------------------------------
728   private void parseAC(String inLine)
729   {
730      String[] pieces = inLine.substring(2).split(";\\s*");
731
732      if (null == mCurrentSeq.getID())
733      {
734         mCurrentSeq.setID(pieces[0].trim());
735      }
736
737      // TODO: handle additional ids
738   }
739
740   //---------------------------------------------------------------------------
741   private void parseDE(String inLine)
742   {
743      String description = inLine.substring(2).trim();
744      if (StringUtil.isSet(mCurrentSeq.getDescription()))
745      {
746         description = mCurrentSeq.getDescription() + " " + description;
747      }
748
749      mCurrentSeq.setDescription(description);
750   }
751
752   //---------------------------------------------------------------------------
753   // Parse keywords
754   private void parseKW(String inLine)
755   {
756      String keywordString = inLine.substring(2).trim();
757
758      if (mCurrentSeq instanceof BioSequencePlus)
759      {
760         mCurrentKeywords += keywordString;
761
762         if (mCurrentKeywords.endsWith("."))
763         {
764            String[] keywords = mCurrentKeywords.split("[;\\.]");
765            for (String keyword : keywords)
766            {
767               if (StringUtil.isSet(keyword))
768               {
769                  ((BioSequencePlus) mCurrentSeq).addKeyword(keyword.trim());
770               }
771            }
772         }
773      }
774   }
775
776   //---------------------------------------------------------------------------
777   // Parse the reference authors
778   private void parseRA(String inLine)
779   {
780      String[] authors = null;
781      if (mFormatVariant.equals(FormatVariant.EMBL))
782      {
783         authors = inLine.substring(2).split(";\\s*");
784      }
785      else if (mFormatVariant.equals(FormatVariant.UniProt))
786      {
787         authors = inLine.substring(2).split(",\\s*");
788      }
789
790      if (authors != null)
791      {
792         for (String author : authors)
793         {
794            if (StringUtil.isSet(author))
795            {
796               mCurrentCitation.addAuthor(new Author(author.trim()));
797            }
798         }
799      }
800   }
801
802   //---------------------------------------------------------------------------
803   // Parse the reference title
804   private void parseRT(String inLine)
805   {
806      String title = inLine.substring(2).trim();
807      if (title.endsWith(";"))
808      {
809         title = title.substring(0, title.length() - 1);
810      }
811
812      if (StringUtil.isSet(title))
813      {
814         if (mCurrentCitation.getTitle() != null)
815         {
816            title = mCurrentCitation.getTitle() + " " + title;
817         }
818
819         if (StringUtil.isQuoted(title))
820         {
821            title = StringUtil.unquote(title);
822         }
823
824         mCurrentCitation.setTitle(title);
825      }
826   }
827
828   //---------------------------------------------------------------------------
829   private void parseRX(String inLine)
830   {
831      String xref = inLine.substring(2).trim();
832      if (xref.endsWith("."))
833      {
834         xref = xref.substring(0, xref.length() - 1);
835      }
836
837      String[] pieces = xref.split(";\\s*");
838
839      if (pieces[0].equals("PUBMED"))
840      {
841         mCurrentCitation.setPubMedId(pieces[1]);
842      }
843
844      // TODO: handle other x-refs
845   }
846
847   //---------------------------------------------------------------------------
848   // Parse reference position value
849   private void parseRP(String inLine)
850   {
851      String positionString = inLine.substring(2).trim();
852      if (StringUtil.isSet(positionString))
853      {
854         String[] positionStrings = positionString.split("\\s*-\\s*");
855         if (2 == positionStrings.length)
856         {
857            mCurrentCitation.setSeqLocation(new SeqLocation().setStart(Integer.parseInt(positionStrings[0].trim())).setEnd(Integer.parseInt(positionStrings[1].trim())));
858         }
859      }
860   }
861
862   //---------------------------------------------------------------------------
863   // Parse reference location
864   private void parseRL(String inLine)
865   {
866      String stringValue = inLine.substring(2).trim();
867      if (StringUtil.isSet(stringValue))
868      {
869         mCurrentCitation.appendRawContent(stringValue);
870      }
871   }
872
873   //---------------------------------------------------------------------------
874   private void parseOS(String inLine)
875   {
876      String text = inLine.substring(2).trim();
877
878      if (null == mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR))
879      {
880         mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, text);
881      }
882      else
883      {
884         mCurrentSeq.setAttribute(SPECIES_SCIENTIFIC_NAME_ATTR, mCurrentSeq.getAttribute(SPECIES_SCIENTIFIC_NAME_ATTR) + " " + text);
885      }
886   }
887
888   //---------------------------------------------------------------------------
889   private void parseOC(String inLine)
890   {
891      String text = inLine.substring(2).trim();
892
893      if (null == mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR))
894      {
895         mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, text);
896      }
897      else
898      {
899         mCurrentSeq.setAttribute(ORGANISM_CLASSIFICATION_ATTR, mCurrentSeq.getAttribute(ORGANISM_CLASSIFICATION_ATTR) + " " + text);
900      }
901   }
902
903
904   //---------------------------------------------------------------------------
905   // Parse organism taxonomy cross-reference
906   // OX   Taxonomy_database_Qualifier=Taxonomic code;
907   // Example:
908   //   OX   NCBI_TaxID=9606;
909   private void parseOX(String inLine)
910   {
911      String[] pieces = inLine.substring(2).trim().split("=");
912
913      // Remove trailing ';'
914      if (pieces[1].endsWith(";"))
915      {
916         pieces[1] = pieces[1].substring(0, pieces[1].length() - 1);
917      }
918
919      mCurrentSeq.setAttribute(ORGANISM_NCBI_TAXON_ID_ATTR, Integer.parseInt(pieces[1]));
920      if (mCurrentSeq instanceof BioSequencePlus)
921      {
922         NCBITaxon taxon = NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1]));
923         if (taxon != null)
924         {
925            ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon);
926         }
927      }
928   }
929
930
931   //---------------------------------------------------------------------------
932   // Parse database cross-reference
933   // The format of the DR line is:
934   //   DR   RESOURCE_ABBREVIATION; RESOURCE_IDENTIFIER; OPTIONAL_INFORMATION_1[; OPTIONAL_INFORMATION_2][; OPTIONAL_INFORMATION_3].
935   // Example:
936   //   DR   EMBL; U29082; AAA68403.1; -; Genomic_DNA.
937   private void parseDR(String inLine)
938   {
939      String[] pieces = inLine.substring(2).trim().split(";\\s*");
940
941      // Remove trailing '.' from the last piece
942      if (pieces[pieces.length - 1].endsWith("."))
943      {
944         pieces[pieces.length - 1] = pieces[pieces.length - 1].substring(0, pieces[pieces.length - 1].length() - 1);
945      }
946
947      DbXref xref = new DbXref(pieces[0], pieces[1]);
948      if (pieces.length > 2)
949      {
950         StringBuilderPlus description = new StringBuilderPlus().setDelimiter("; ");
951         for (int i = 2; i < pieces.length; i++)
952         {
953            description.delimitedAppend(pieces[i]);
954         }
955         xref.setDescription(description.toString());
956      }
957
958      if (mCurrentSeq instanceof BioSequencePlus)
959      {
960         ((BioSequencePlus) mCurrentSeq).addDbXref(xref);
961      }
962   }
963
964
965
966   //---------------------------------------------------------------------------
967   // Parse feature
968   private void parseFT(String inLine)
969   {
970      if (mFormatVariant.equals(FormatVariant.EMBL))
971      {
972         pareGenBankFeatureTableLine(inLine);
973      }
974      else if (mFormatVariant.equals(FormatVariant.UniProt))
975      {
976         pareUniProtFeatureTableLine(inLine);
977      }
978   }
979
980   private static final Pattern sGenBankFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?");
981
982   //---------------------------------------------------------------------------
983   private void pareGenBankFeatureTableLine(String inLine)
984   {
985      // Is there a feature key on this line?
986      String featureKeyString = inLine.substring(5, 20).trim();
987      if (StringUtil.isSet(featureKeyString))
988      {
989         GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString);
990         if (null == featureKey)
991         {
992            throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!");
993         }
994
995         String locationString = inLine.substring(21).trim();
996         mCurrentGenBankFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString));
997         if (mCurrentSeq instanceof BioSequencePlus)
998         {
999            ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentGenBankFeature);
1000         }
1001
1002         // Unquote the previous qualifier if necessary
1003         if (mCurrentGenBankFeatureQualifier != null
1004               && mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
1005         {
1006            mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
1007         }
1008         mCurrentGenBankFeatureQualifier = null;
1009      }
1010      else
1011      {
1012         String content = inLine.substring(21).trim();
1013
1014         Matcher m = sGenBankFeatureQualifierPattern.matcher(content);
1015         if (m.matches())
1016         {
1017            // New qualifier
1018
1019            // Unquote the previous qualifier if necessary
1020            if (mCurrentGenBankFeatureQualifier != null
1021                  && mCurrentGenBankFeatureQualifier.getValue().startsWith("\""))
1022            {
1023               mCurrentGenBankFeatureQualifier.setValue(StringUtil.unquote(mCurrentGenBankFeatureQualifier.getValue()));
1024            }
1025
1026            GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1));
1027            if (null == qualifierName)
1028            {
1029               throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
1030            }
1031
1032            mCurrentGenBankFeatureQualifier = new GenBankFeatureQualifier(qualifierName);
1033            mCurrentGenBankFeature.addQualifier(mCurrentGenBankFeatureQualifier);
1034
1035            String value = m.group(2);
1036            if (value != null)
1037            {
1038               mCurrentGenBankFeatureQualifier.appendToValue(value);
1039            }
1040         }
1041         else if (mCurrentGenBankFeatureQualifier != null)
1042         {
1043            // Continuation of a previous qualifier
1044            mCurrentGenBankFeatureQualifier.appendToValue(content);
1045         }
1046         else if (mCurrentGenBankFeature != null)
1047         {
1048            // Continuation of a feature location
1049            mCurrentGenBankFeature.getLocation().append(content);
1050         }
1051      }
1052   }
1053
1054
1055   // Example UniProt FT entry:
1056   //  FT   CHAIN         1    312       Methionyl-tRNA formyltransferase.
1057   //  FT                                /FTId=PRO_1000077286.
1058
1059   private static final Pattern sUniProt_FT_FirstLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{1,5}(\\w+)\\s{1,10}([\\?\\<]?\\d*)\\s{1,10}([\\?\\>]?\\d*)\\s+(.+)");
1060   private static final Pattern sUniProt_FT_AdditionalLinePattern = Pattern.compile(FT_LINE_CODE + "\\s{20,}(.+)");
1061
1062   //---------------------------------------------------------------------------
1063   private void pareUniProtFeatureTableLine(String inLine)
1064   {
1065      Matcher m = sUniProt_FT_FirstLinePattern.matcher(inLine);
1066      if (m.matches())
1067      {
1068         UniProtFeatureKey featureKey = UniProtFeatureKey.valueOf(m.group(1));
1069         UniProtFeatureLocation location = new UniProtFeatureLocation(m.group(2), m.group(3));
1070         mCurrentUniProtFeature = new UniProtFeature(featureKey, location).setDescription(m.group(4));
1071         if (mCurrentSeq instanceof BioSequencePlus)
1072         {
1073            ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentUniProtFeature);
1074         }
1075      }
1076      else
1077      {
1078         m = sUniProt_FT_AdditionalLinePattern.matcher(inLine);
1079         if (m.matches())
1080         {
1081            if (m.group(1).startsWith("/"))
1082            {
1083               String[] pieces = m.group(1).substring(1).split("=");
1084               if (2 != pieces.length)
1085               {
1086                  throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
1087               }
1088
1089               UniProtFeatureQualifierName qualifierName = UniProtFeatureQualifierName.valueOf(pieces[0]);
1090               if (null == qualifierName)
1091               {
1092                  throw new SeqFormatException(StringUtil.singleQuote(pieces[0]) + " is not a recognized qualifier name!");
1093               }
1094
1095               mCurrentUniProtFeatureQualifier = new UniProtFeatureQualifier(qualifierName);
1096               mCurrentUniProtFeature.addQualifier(mCurrentUniProtFeatureQualifier);
1097
1098               // Trim trailing period
1099               if (pieces[1].endsWith("."))
1100               {
1101                  pieces[1] = pieces[1].substring(0, pieces[1].length() - 1);
1102               }
1103               mCurrentUniProtFeatureQualifier.appendToValue(pieces[1]);
1104            }
1105            else
1106            {
1107               mCurrentUniProtFeature.appendDescription(m.group(1));
1108            }
1109         }
1110      }
1111   }
1112
1113   //###########################################################################
1114   // INNER CLASS
1115   //###########################################################################
1116
1117   class SeqFilterReader extends LettersOnlyReader
1118   {
1119      //---------------------------------------------------------------------------
1120      public SeqFilterReader(Reader inReader)
1121      {
1122         super(inReader);
1123      }
1124
1125      //---------------------------------------------------------------------------
1126      @Override
1127      public int read()
1128            throws IOException
1129      {
1130         int returnChar;
1131
1132         do
1133         {
1134            returnChar = innerRead();
1135         }
1136         while (returnChar >= 0
1137               && (Character.isWhitespace(returnChar)
1138                   || Character.isDigit(returnChar)
1139                   || returnChar == '/'));
1140
1141         return returnChar;
1142      }
1143   }
1144
1145}