001package com.hfg.bio.seq.format;
002
003import java.io.BufferedReader;
004import java.io.BufferedWriter;
005import java.io.IOException;
006import java.io.OutputStream;
007import java.io.OutputStreamWriter;
008import java.io.Reader;
009import java.io.StringWriter;
010import java.io.Writer;
011import java.text.ParseException;
012import java.text.SimpleDateFormat;
013import java.util.ArrayList;
014import java.util.Collection;
015import java.util.Collections;
016import java.util.List;
017import java.util.Map;
018import java.util.Set;
019import java.util.logging.Level;
020import java.util.logging.Logger;
021import java.util.regex.Matcher;
022import java.util.regex.Pattern;
023
024import com.hfg.bio.DbXref;
025import com.hfg.bio.seq.BioSequencePlus;
026import com.hfg.bio.seq.BioSequenceType;
027import com.hfg.bio.seq.Clone;
028import com.hfg.bio.seq.SeqLocation;
029import com.hfg.bio.seq.SeqTopology;
030import com.hfg.bio.seq.format.feature.FeatureQualifier;
031import com.hfg.bio.seq.format.feature.SeqFeature;
032import com.hfg.bio.seq.BioSequence;
033import com.hfg.bio.seq.BioSequenceFactory;
034import com.hfg.bio.seq.format.feature.genbank.*;
035import com.hfg.bio.seq.format.feature.qualifier.MolType;
036import com.hfg.bio.seq.format.genbank.GenBankKeyword;
037import com.hfg.bio.seq.format.genbank.GenBankSubkeyword;
038import com.hfg.bio.seq.format.genbank.InvalidGenBankKeywordException;
039import com.hfg.bio.seq.format.genbank.InvalidGenBankSubkeywordException;
040import com.hfg.bio.taxonomy.ncbi.NCBIGenBankDivision;
041import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
042import com.hfg.citation.Author;
043import com.hfg.citation.CitationType;
044import com.hfg.citation.Journal;
045import com.hfg.citation.PatentData;
046import com.hfg.datetime.DateUtil;
047import com.hfg.util.StringBuilderPlus;
048import com.hfg.util.StringUtil;
049import com.hfg.util.collection.CollectionUtil;
050import com.hfg.util.collection.OrderedMap;
051import com.hfg.util.io.LettersOnlyReader;
052
053//------------------------------------------------------------------------------
054/**
055 GenBank sequence format.
056 <p>
057 See <a href='ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt'>ftp://ftp.ncbi.nlm.nih.gov/genbank/gbrel.txt</a>
058 </p>
059 <p>
060 See <a href='http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html'>http://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html</a>
061 </p>
062 @author J. Alex Taylor, hairyfatguy.com
063 */
064//------------------------------------------------------------------------------
065// com.hfg Library
066//
067// This library is free software; you can redistribute it and/or
068// modify it under the terms of the GNU Lesser General Public
069// License as published by the Free Software Foundation; either
070// version 2.1 of the License, or (at your option) any later version.
071//
072// This library is distributed in the hope that it will be useful,
073// but WITHOUT ANY WARRANTY; without even the implied warranty of
074// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
075// Lesser General Public License for more details.
076//
077// You should have received a copy of the GNU Lesser General Public
078// License along with this library; if not, write to the Free Software
079// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
080//
081// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
082// jataylor@hairyfatguy.com
083//------------------------------------------------------------------------------
084
085public class GenBank<T extends BioSequence> extends ReadableSeqFormatBase<T>   implements WritableSeqFormat<T>
086{
087
088   // Variables used during parsing
089   private T                 mCurrentSeq;
090   private GenBankKeyword    mCurrentKeyword;
091   private GenBankSubkeyword mCurrentSubkeyword;
092   private GenBankFeature    mCurrentFeature;
093   private GenBankFeatureQualifier mCurrentFeatureQualifier;
094   private SeqCitation       mCurrentReference;
095   private Integer           mSeqLengthFromLocusLine;
096
097   private SimpleDateFormat  mDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
098
099
100   private int mMaxExceptionsPerRecord = 0;
101
102//   private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)\\s+(?:[\\-\\w]+)?(?:\\s+(\\w+))?\\s+(\\w{3})\\s+(\\S{11})");
103//   private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s{7}(\\w+)?\\s+(?:\\w+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
104   private static final Pattern sLocusPattern = Pattern.compile("LOCUS\\s+(\\S+)?\\s+(?:\\S+\\s+)?(\\d+)\\s(?:bp|aa)(?:\\s+((?:[sdm]s-)?(?:NA|DNA|RNA|tRNA|rRNA|mRNA|uRNA|cRNA)))?(?:\\s+(linear|circular))?(?:\\s+(\\w{3}))?(?:\\s+(\\S{10,11}))?");
105   private static final Pattern sFeatureQualifierPattern = Pattern.compile("/(\\S+?)(?:=(.+))?");
106   private static final Pattern sReferenceLocationPattern = Pattern.compile("\\(bases (\\d+) to (\\d+)\\)");
107   private static final Pattern sReferencePatentPattern = Pattern.compile("Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?");
108   private static final Pattern sReferenceDirectSubmissionPattern = Pattern.compile("Submitted \\((\\d{2}-\\w{3}-(\\d{4}))\\)\\s+(.*)");
109
110   private static final Pattern sPatentLocationPattern = Pattern.compile(".+, \\w{2}");
111   private static final Pattern sPatentParensLocationPattern = Pattern.compile(".+ \\(\\w{2}\\)");
112
113   // Examples:
114   // Thesis
115   // Thesis (1996) Utrecht University, The Netherlands
116   private static final Pattern sReferenceThesisPattern = Pattern.compile("Thesis(?: \\((\\d{4})\\)\\s+(.*))?");
117
118   // Examples:
119   // Proc. Natl. Acad. Sci. U.S.A. 82 (3), 844-848 (1985)
120   // Front Immunol 9, 1079 (2018)
121   // Nat Commun (2018) In press
122   // Dev. Comp. Immunol. 25 (5-6), 387-401
123   // J. Exp. Zool. 295B (1), 45-58 (2003)
124   // PLoS ONE 8 (8), E70650 (2013)
125   // Mol. Phylogenet. Evol. 94 (Pt B), 577-590 (2016)
126   private static final Pattern sReferenceJournalPattern = Pattern.compile("(.+?)(?:\\s+(\\S+)(?:\\s+\\(([^\\)]+)\\))?,\\s+(\\d+(?:\\-\\d+)?|E\\d+))?(?:\\s+\\((\\d{4})\\))?(?: In press)?", Pattern.CASE_INSENSITIVE);
127
128   private static final SimpleDateFormat sDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
129
130   public static final String COMMENT_ATTR     = "Comment";
131   public static final String NCBI_GI_ATTR     = "NCBI GI";
132   public static final String CONTIG_ATTR      = "Contig";
133
134   private final static Logger LOGGER = Logger.getLogger(GenBank.class.getName());
135
   // Class initialization: set this format's logger to the WARNING level and
   // have its log records passed along to the parent logger's handlers.
   static
   {
      LOGGER.setLevel(Level.WARNING);
      LOGGER.setUseParentHandlers(true);
   }
141
142   //###########################################################################
143   // CONSTRUCTORS
144   //###########################################################################
145
146   //---------------------------------------------------------------------------
   /**
    Creates a GenBank format object.
    * @param inSeqFactory the sequence factory handed to the superclass constructor.
    *        readRecord() obtains each new sequence object via getBioSequenceFactory().createSeqObj().
    */
   public GenBank(BioSequenceFactory<T> inSeqFactory)
   {
      super(inSeqFactory);
   }
151
152   //###########################################################################
153   // PUBLIC METHODS
154   //###########################################################################
155
156   //---------------------------------------------------------------------------
   /**
    Provides access to the logger used by this class so that callers can adjust its configuration.
    * @return the class-level Logger (named after the GenBank class)
    */
   public static Logger getLogger()
   {
      return LOGGER;
   }
161
162   //---------------------------------------------------------------------------
163   public boolean isEndOfRecord(String inLine)
164   {
165      // Trying for something slightly more efficient than inLine.trim().equals("//")
166      return inLine.startsWith("//") && 2 == inLine.trim().length();
167   }
168
169   //---------------------------------------------------------------------------
   /**
    Reports whether this format uses a "Janus" record delimiter. Always false for GenBank.
    NOTE(review): the precise meaning of a Janus delimiter is defined outside of this
    class - presumably a delimiter line that both ends one record and starts the next; confirm.
    * @return false
    */
   public boolean hasJanusDelimiter()
   {
      return false;
   }
174
175   //---------------------------------------------------------------------------
   /**
    Specifies the maximum number of Exceptions to tolerate per record. Defaults to zero
    (the first line-level parsing problem aborts the record).
    This mechanism only works with sequence objects that implement the BioSequencePlus interface.
    While the number of Exceptions already recorded for a record is below the specified maximum,
    a line-level parsing Exception is stored on the sequence instead of being thrown and can be
    retrieved afterward via the getParseExceptions() method on the BioSequencePlus sequence object.
    * @param inValue the maximum number of Exceptions to tolerate per record
    * @return this format object to facilitate method chaining.
    */
   public GenBank<T> setMaxExceptionsPerRecord(int inValue)
   {
      mMaxExceptionsPerRecord = inValue;
      return this;
   }
190
191   //---------------------------------------------------------------------------
192   public T readRecord(BufferedReader inReader)
193         throws SeqIOException
194   {
195      initRecordParsing();
196
197      int lineCount = 0;
198      int maxPreLocusLines = 50;
199      boolean locusLineFound = false;
200      boolean originLineFound = false;
201
202      mCurrentSeq = getBioSequenceFactory().createSeqObj();
203
204      try
205      {
206         String line;
207         while ((line = inReader.readLine()) != null)
208         {
209            lineCount++;
210            try
211            {
212               if (!locusLineFound)
213               {
214                  if (lineCount > maxPreLocusLines)
215                  {
216                     throw new SeqFormatException("No GenBank " + GenBankKeyword.LOCUS
217                                                  + " line found within " + maxPreLocusLines
218                                                  + " lines of the start!");
219                  }
220
221                  try
222                  {
223                     GenBankKeyword keyword = getLineKeyword(line);
224                     if (GenBankKeyword.LOCUS.equals(keyword))
225                     {
226                        locusLineFound = true;
227                     }
228                     else
229                     {
230                        continue;
231                     }
232                  }
233                  catch (InvalidGenBankKeywordException e)
234                  {
235                     // Ignore
236                     continue;
237                  }
238               }
239               else if (isEndOfRecord(line))
240               {
241                  break;
242               }
243
244               if (locusLineFound
245                   && StringUtil.isSet(line))
246               {
247                  parseLine(line);
248
249                  if (GenBankKeyword.ORIGIN.equals(mCurrentKeyword))
250                  {
251                     originLineFound = true;
252                     break;
253                  }
254               }
255            }
256            catch(Exception e)
257            {
258               SeqIOException seqIOException = new SeqIOException("Problem parsing "
259                     + (StringUtil.isSet(mCurrentSeq.getID()) ? mCurrentSeq.getID() + " " : "")
260                     + "record line " + lineCount + " : " + StringUtil.singleQuote(line), e);
261
262               if (mMaxExceptionsPerRecord > 0
263                   && mCurrentSeq instanceof BioSequencePlus
264                   && (! ((BioSequencePlus) mCurrentSeq).hadParseExceptions()
265                       || ((BioSequencePlus) mCurrentSeq).getParseExceptions().size() < mMaxExceptionsPerRecord))
266               {
267                  ((BioSequencePlus) mCurrentSeq).addParseException(seqIOException);
268                  GenBank.getLogger().warning(e.getMessage());
269               }
270               else
271               {
272                  throw seqIOException;
273               }
274            }
275         }
276
277         if (! locusLineFound)
278         {
279            throw new SeqFormatException("No GenBank LOCUS line found!");
280         }
281
282         if (originLineFound)
283         {
284            // The rest of the record is assumed to be sequence
285
286            // Cleanup the sequence to remove spaces and numbers
287            //        Reader filterReader = new GenBankSeqFilterReader(inReader);
288            //        mCurrentSeq.setSequence(filterReader);
289
290            //        filterReader.close();
291            mCurrentSeq.setSequence(inReader);
292         }
293
294         inReader.close();
295      }
296      catch (Exception e)
297      {
298         throw new SeqIOException("Problem parsing GenBank record" + (StringUtil.isSet(mCurrentSeq.getID()) ? " " + mCurrentSeq.getID() : "") + "!", e);
299      }
300
301      if (! locusLineFound)
302      {
303         throw new SeqFormatException("No LOCUS line detected in the GenBank record!");
304      }
305
306      return mCurrentSeq;
307   }
308
309   //---------------------------------------------------------------------------
310   public String write(Collection<T> inSeqs)
311         throws SeqIOException
312   {
313      StringWriter writer = new StringWriter();
314      for (T seq : inSeqs)
315      {
316         write(seq, writer);
317      }
318
319      return writer.toString();
320   }
321
322   //---------------------------------------------------------------------------
323   public String write(T inSeq)
324         throws SeqIOException
325   {
326      StringWriter writer = new StringWriter();
327
328      write(inSeq, writer);
329
330      return writer.toString();
331   }
332
333   //---------------------------------------------------------------------------
334   public void write(T inSeq, OutputStream inStream)
335         throws SeqIOException
336   {
337      Writer writer = new OutputStreamWriter(inStream);
338      write(inSeq, writer);
339      try
340      {
341         writer.flush();
342      }
343      catch (Exception e)
344      {
345         throw new SeqIOException(e);
346      }
347   }
348
349   //---------------------------------------------------------------------------
350   public void write(T inSeq, Writer inWriter)
351         throws SeqIOException
352   {
353      Reader seqReader = null;
354      BufferedWriter writer = null;
355      try
356      {
357         try
358         {
359            if (writer instanceof BufferedWriter)
360            {
361               writer = (BufferedWriter) inWriter;
362            } else
363            {
364               writer = new BufferedWriter(inWriter, 8196);
365            }
366
367            // Write the LOCUS line
368            writeLocus(inSeq, writer);
369
370            // Write the DEFINTION line(s)
371            writeDefinition(inSeq, writer);
372
373            // Write the ACCESSION line
374            writeAccession(inSeq, writer);
375
376            // Write the VERSION line
377            writeVersion(inSeq, writer);
378
379            // TODO: SOURCE
380            
381            if (inSeq instanceof BioSequencePlus)
382            {
383               BioSequencePlus seqPlus = (BioSequencePlus) inSeq;
384
385               if (CollectionUtil.hasValues(seqPlus.getDbXrefs()))
386               {
387                  writeDBLinks(seqPlus.getDbXrefs(), writer);
388               }
389
390               if (CollectionUtil.hasValues(seqPlus.getReferences()))
391               {
392                  writeReferences(seqPlus.getReferences(), writer);
393               }
394
395               // Write features
396               if (CollectionUtil.hasValues(seqPlus.getFeatures()))
397               {
398                  writer.write(GenBankKeyword.FEATURES + "             Location/Qualifiers\n");
399                  for (SeqFeature seqFeature : seqPlus.getFeatures())
400                  {
401                     writeFeature(seqFeature, writer);
402                  }
403               }
404            }
405
406            // Write the sequence lines
407            writer.write(GenBankKeyword.ORIGIN + "\n");
408
409            seqReader = inSeq.getSequenceReader();
410
411            int bufferSize = 60;
412            char[] buffer = new char[bufferSize];
413            int residueNum = 1;
414            int numBytesRead;
415
416            while ((numBytesRead = seqReader.read(buffer)) != -1)
417            {
418               if (numBytesRead < bufferSize)
419               {
420                  int secondNumBytesRead = seqReader.read(buffer, numBytesRead, buffer.length - numBytesRead);
421                  if (secondNumBytesRead != -1)
422                  {
423                     numBytesRead += secondNumBytesRead;
424                  }
425               }
426               
427               writer.write(String.format("%9d", residueNum));
428               for (int i = 0; i < numBytesRead; i += 10)
429               {
430                  writer.write(" ");
431                  writer.write(buffer, i, i + 10 > numBytesRead ? numBytesRead - i: 10);
432               }
433
434               writer.write("\n");
435
436
437               residueNum += numBytesRead;
438            }
439
440            // Write end of record line
441            writer.write("//\n");
442         }
443         finally
444         {
445            if (seqReader != null)
446            {
447               seqReader.close();
448            }
449
450            if (writer != null)
451            {
452               writer.flush();
453            }
454         }
455      }
456      catch (SeqIOException e)
457      {
458         throw e;
459      }
460      catch (Exception e)
461      {
462         throw new SeqIOException(e);
463      }
464   }
465
466   //###########################################################################
467   // PRIVATE METHODS
468   //###########################################################################
469
470   //---------------------------------------------------------------------------
471   private void initRecordParsing()
472   {
473      mCurrentSeq        = null;
474      mCurrentKeyword    = null;
475      mCurrentSubkeyword = null;
476      mCurrentFeature    = null;
477      mCurrentFeatureQualifier = null;
478      mCurrentReference  = null;
479   }
480
481   //---------------------------------------------------------------------------
482   // A keyword starts in column one and has a maximum of 10 characters.
483   private GenBankKeyword getLineKeyword(String inLine)
484   {
485      GenBankKeyword keyword = null;
486      if (Character.isLetter(inLine.charAt(0)))
487      {
488         String keywordString = (inLine.length() > 11 ? inLine.substring(0, 11) : inLine).trim();
489         if (StringUtil.isSet(keywordString))
490         {
491            keyword = GenBankKeyword.valueOf(keywordString);
492            if (null == keyword)
493            {
494               throw new InvalidGenBankKeywordException(StringUtil.singleQuote(keywordString) + " is not a recognized GenBank keyword!");
495            }
496         }
497      }
498
499      return keyword;
500   }
501
502   //---------------------------------------------------------------------------
503   // A subkeyword starts in column three and has a maximum of 8 characters.
504   private GenBankSubkeyword getLineSubkeyword(String inLine)
505   {
506      GenBankSubkeyword subkeyword = null;
507      if (Character.isWhitespace(inLine.charAt(0))
508          && Character.isWhitespace(inLine.charAt(1)))
509      {
510         String subkeywordString = (inLine.length() > 11 ? inLine.substring(2, 11) : inLine).trim();
511         if (StringUtil.isSet(subkeywordString))
512         {
513            subkeyword = GenBankSubkeyword.valueOf(subkeywordString);
514            if (null == subkeyword)
515            {
516               throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword!");
517            }
518            else if (! mCurrentKeyword.allowsSubkeyword(subkeyword))
519            {
520               throw new InvalidGenBankSubkeywordException(StringUtil.singleQuote(subkeywordString) + " is not a recognized GenBank subkeyword of " + mCurrentKeyword + "!");
521            }
522         }
523      }
524
525      return subkeyword;
526   }
527
528   //---------------------------------------------------------------------------
529   private void finishPreviousKeyword()
530      throws ParseException
531   {
532      if (GenBankKeyword.DEFINITION.equals(mCurrentKeyword))
533      {
534         finishDefinition();
535      }
536      else if (GenBankKeyword.REFERENCE.equals(mCurrentKeyword))
537      {
538         finishReference();
539      }
540      else if (mCurrentFeatureQualifier != null)
541      {
542         // The last qualifier of the feature table may need unquoting if we just finished w/ FEATURES
543         if (mCurrentFeatureQualifier.getValue().startsWith("\""))
544         {
545            mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue()));
546         }
547         mCurrentFeatureQualifier = null;
548
549         if (mCurrentSeq instanceof BioSequencePlus)
550         {
551            List<SeqFeature> sourceFeatures = ((BioSequencePlus) mCurrentSeq).getFeatures(GenBankFeatureKey.source);
552            if (CollectionUtil.hasValues(sourceFeatures))
553            {
554               SeqFeature source = sourceFeatures.get(0);
555
556               List<FeatureQualifier> molTypeQualifiers = source.getQualifiers(GenBankFeatureQualifierName.mol_type.name());
557               if (CollectionUtil.hasValues(molTypeQualifiers))
558               {
559                  MolType molType = MolType.valueOf(molTypeQualifiers.get(0).getValue());
560                  if (molType != null)
561                  {
562                     ((BioSequencePlus) mCurrentSeq).setMolType(molType);
563                  }
564               }
565
566               // /db_xref="taxon:9606"
567               List<FeatureQualifier> dbXrefQualifiers = source.getQualifiers(GenBankFeatureQualifierName.db_xref.name());
568               if (CollectionUtil.hasValues(dbXrefQualifiers))
569               {
570                  for (FeatureQualifier qualifier : dbXrefQualifiers)
571                  {
572                     String[] pieces = qualifier.getValue().split(":");
573                     if (pieces[0].equals("taxon"))
574                     {
575                        ((BioSequencePlus) mCurrentSeq).setNCBITaxon(NCBITaxon.getByTaxonId(Integer.parseInt(pieces[1])));
576                        break;
577                     }
578                  }
579               }
580
581               List<FeatureQualifier> cloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.clone.name());
582               if (CollectionUtil.hasValues(cloneQualifiers))
583               {
584                  Clone clone = new Clone(cloneQualifiers.get(0).getValue());
585
586                  List<FeatureQualifier> subcloneQualifiers = source.getQualifiers(GenBankFeatureQualifierName.sub_clone.name());
587                  if (CollectionUtil.hasValues(subcloneQualifiers))
588                  {
589                     clone.setSubcloneName(subcloneQualifiers.get(0).getValue());
590                  }
591
592                  ((BioSequencePlus) mCurrentSeq).setClone(clone);
593               }
594            }
595         }
596      }
597
598   }
599
600   //---------------------------------------------------------------------------
601   private void parseLine(String inLine)
602         throws Exception
603   {
604      GenBankKeyword keyword = getLineKeyword(inLine);
605      if (keyword != null)
606      {
607         // Found the start of a new keyword field
608         finishPreviousKeyword();
609
610         mCurrentKeyword = keyword;
611         mCurrentSubkeyword = null;
612
613         parseField(inLine);
614      }
615      else
616      {
617         // Continuation of an existing field
618         if (GenBankKeyword.FEATURES.equals(mCurrentKeyword))
619         {
620            // Features have a special set of feature keys
621            parseFeatures(inLine);
622         }
623         else
624         {
625            GenBankSubkeyword subkeyword = getLineSubkeyword(inLine);
626            if (subkeyword != null)
627            {
628               // Start of a new subfield
629               mCurrentSubkeyword = subkeyword;
630            }
631            else
632            {
633               // Continuation of an existing subfield
634            }
635
636            parseField(inLine);
637         }
638      }
639   }
640
641   //---------------------------------------------------------------------------
642   private void parseField(String inLine)
643      throws Exception
644   {
645      if (mCurrentKeyword.equals(GenBankKeyword.LOCUS))
646      {
647         parseLocus(inLine);
648      }
649      else if (mCurrentKeyword.equals(GenBankKeyword.DEFINITION))
650      {
651         parseDefinition(inLine);
652      }
653      else if (mCurrentKeyword.equals(GenBankKeyword.VERSION))
654      {
655         parseVersion(inLine);
656      }
657      else if (mCurrentKeyword.equals(GenBankKeyword.KEYWORDS))
658      {
659         parseKeywords(inLine);
660      }
661      else if (mCurrentKeyword.equals(GenBankKeyword.SOURCE))
662      {
663         parseSource(inLine);
664      }
665      else if (mCurrentKeyword.equals(GenBankKeyword.REFERENCE))
666      {
667         parseReference(inLine);
668      }
669      else if (mCurrentKeyword.equals(GenBankKeyword.COMMENT))
670      {
671         parseComment(inLine);
672      }
673      else if (mCurrentKeyword.equals(GenBankKeyword.DBLINK))
674      {
675         parseDBLink(inLine);
676      }
677      else if (mCurrentKeyword.equals(GenBankKeyword.FEATURES))
678      {
679         parseFeatures(inLine);
680      }
681      else if (mCurrentKeyword.equals(GenBankKeyword.CONTIG))
682      {
683         parseContig(inLine);
684      }
685
686      // NID is skipped
687      // PROJECT is skipped
688      // SEGMENT is skipped
689      // BASE COUNT is skipped
690   }
691
692   //---------------------------------------------------------------------------
693   // Parse the LOCUS keyword line
694   // Ex:
695   // LOCUS       R88064                   460 bp    mRNA    linear   EST 16-AUG-1995
696   // LOCUS       pDR000029812            7616 bp    circular
697   // LOCUS       vDR\365                 8070 bp    DNA     circular     21-MAR-2011
698   //          PairwiseSeqAligner aligner = new PairwiseSeqAligner(alignmentSettings);
699
700   // Although it isn't always followed exactly, the detailed format for the LOCUS line format is as follows:
701   //
702   // Positions  Contents
703   // ---------  --------
704   //  01-05      'LOCUS'
705   //  06-12      spaces
706   //  13-28      Locus name
707   //  29-29      space
708   //  30-40      Length of sequence, right-justified
709   //  41-41      space
710   //  42-43      bp
711   //  44-44      space
712   //  45-47      spaces, ss- (single-stranded), ds- (double-stranded), or
713   //             ms- (mixed-stranded)
714   //  48-53      NA, DNA, RNA, tRNA (transfer RNA), rRNA (ribosomal RNA),
715   //             mRNA (messenger RNA), uRNA (small nuclear RNA).
716   //             Left justified.
717   //  54-55      space
718   //  56-63      'linear' followed by two spaces, or 'circular'
719   //  64-64      space
720   //  65-67      The division code
721   //  68-68      space
722   //  69-79      Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991)
723   //
724   private void parseLocus(String inLine)
725   {
726      Matcher m = sLocusPattern.matcher(inLine);
727      if (! m.matches())
728      {
729         throw new SeqFormatException("The " + GenBankKeyword.LOCUS + " line " + StringUtil.singleQuote(inLine) + " isn't in a recognized format!");
730      }
731
732      if (StringUtil.isSet(m.group(1)))
733      {
734         mCurrentSeq.setID(m.group(1));
735      }
736
737      mSeqLengthFromLocusLine = Integer.parseInt(m.group(2).trim());
738
739      // TODO: Group 3 is the detailed sequence type
740
741      if (mCurrentSeq instanceof BioSequencePlus)
742      {
743         if (StringUtil.isSet(m.group(4)))
744         {
745            ((BioSequencePlus) mCurrentSeq).setSeqTopology(SeqTopology.valueOf(m.group(4)));
746         }
747
748         if (StringUtil.isSet(m.group(5)))
749         {
750            ((BioSequencePlus) mCurrentSeq).setSeqRepositoryDivision(NCBIGenBankDivision.valueOf(m.group(5)));
751         }
752
753         if (StringUtil.isSet(m.group(6)))
754         {
755            try
756            {
757               ((BioSequencePlus) mCurrentSeq).setRevisionDate(mDateFormat.parse(m.group(6)));
758            }
759            catch (ParseException e)
760            {
761               // TODO: Add to warnings?
762               System.err.println(e.getMessage());
763            }
764         }
765      }
766   }
767
768   //---------------------------------------------------------------------------
769   private String getAccession(T inSeq)
770   {
771      String acc = inSeq.getID();
772      if (StringUtil.isSet(acc))
773      {
774         int index = acc.indexOf(".");
775         if (index > 0)
776         {
777            acc = acc.substring(0, index);
778         }
779      }
780      else
781      {
782         acc = "";
783      }
784
785      return acc;
786   }
787
788   //---------------------------------------------------------------------------
789   private void writeLocus(T inSeq, Writer inWriter)
790      throws IOException
791   {
792      inWriter.write(String.format("%5s       %-16.16s %11d %2.2s",
793                                   GenBankKeyword.LOCUS,
794                                   getAccession(inSeq),
795                                   inSeq.length(),
796                                   inSeq.getType().equals(BioSequenceType.PROTEIN) ? "aa" : "bp"));
797
798
799      if (inSeq instanceof BioSequencePlus)
800      {
801         BioSequencePlus bioSequencePlus = (BioSequencePlus) inSeq;
802
803         String molTypeString = "";
804         if (bioSequencePlus.getMolType() != null)
805         {
806            if (bioSequencePlus.getMolType().equals(MolType.genomic_DNA)
807                  || bioSequencePlus.getMolType().equals(MolType.unassigned_DNA)
808                  || bioSequencePlus.getMolType().equals(MolType.other_DNA))
809            {
810               molTypeString = "DNA";
811            }
812            else if (bioSequencePlus.getMolType().equals(MolType.genomic_RNA)
813                     || bioSequencePlus.getMolType().equals(MolType.transcribed_RNA)
814                     || bioSequencePlus.getMolType().equals(MolType.unassigned_RNA)
815                     || bioSequencePlus.getMolType().equals(MolType.other_RNA))
816            {
817               molTypeString = "RNA";
818            }
819            else if (bioSequencePlus.getMolType().equals(MolType.mRNA))
820            {
821               molTypeString = "mRNA";
822            }
823            else if (bioSequencePlus.getMolType().equals(MolType.tRNA))
824            {
825               molTypeString = "tRNA";
826            }
827         }
828
829         // TODO: 'ss-', 'ds-', or 'ms-' prefix for the mol type isn't parsed or output
830         inWriter.write(String.format("    %-6.6s %-8.8s %-3.3s %s",
831                                      molTypeString,
832                                      bioSequencePlus.getSeqTopology() != null ? bioSequencePlus.getSeqTopology() : "",
833                                      bioSequencePlus.getSeqRepositoryDivision() != null ? bioSequencePlus.getSeqRepositoryDivision().getCode() : "",
834                                      bioSequencePlus.getRevisionDate() != null ? mDateFormat.format(bioSequencePlus.getRevisionDate()).toUpperCase() : ""
835         ));
836      }
837
838      inWriter.write("\n");
839   }
840
841   //---------------------------------------------------------------------------
842   // Parse the DEFINITION keyword line
843   // Ex:
844   // DEFINITION  ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone
845   //             IMAGE:165908 5', mRNA sequence.
846   //
847   private void parseDefinition(String inLine)
848   {
849      String field = inLine.substring(12).trim();
850
851      if (null == mCurrentSeq.getDescription())
852      {
853         mCurrentSeq.setDescription(field);
854      }
855      else
856      {
857         mCurrentSeq.setDescription(mCurrentSeq.getDescription() + " " + field);
858      }
859   }
860
861   //---------------------------------------------------------------------------
862   // Remove the trailing period
863   private void finishDefinition()
864   {
865      String definition = mCurrentSeq.getDescription();
866      if (definition != null
867            && definition.endsWith("."))
868      {
869         mCurrentSeq.setDescription(definition.substring(0, definition.length() - 1));
870      }
871   }
872
873   //---------------------------------------------------------------------------
874   private void writeDefinition(T inSeq, Writer inWriter)
875      throws IOException
876   {
877      if (StringUtil.isSet(inSeq.getDescription()))
878      {
879         String description = inSeq.getDescription();
880         if (! description.endsWith("."))
881         {
882            description += ".";
883         }
884
885         String[] lines = StringUtil.lines(StringUtil.wrap(description, 67));
886         for (int i = 0; i < lines.length; i++)
887         {
888            inWriter.write(String.format("%-10s  %s\n",
889                                         0 == i ? GenBankKeyword.DEFINITION : "",
890                                         lines[i]));
891         }
892      }
893   }
894
895   //---------------------------------------------------------------------------
896   private void writeAccession(T inSeq, Writer inWriter)
897      throws IOException
898   {
899      if (StringUtil.isSet(inSeq.getID()))
900      {
901         inWriter.write(String.format("%-9.9s   %s\n",
902                                      GenBankKeyword.ACCESSION,
903                                      getAccession(inSeq)));
904      }
905   }
906
907   //---------------------------------------------------------------------------
908   private void writeVersion(T inSeq, Writer inWriter)
909      throws IOException
910   {
911      if (StringUtil.isSet(inSeq.getID()))
912      {
913         inWriter.write(String.format("%-9.9s   %s\n",
914                                      GenBankKeyword.VERSION,
915                                      inSeq.getID()));
916      }
917   }
918
919   //---------------------------------------------------------------------------
920   // Parse the VERSION keyword line
921   // Ex:
922   // VERSION     AF181452.1  GI:6017929
923   //             ^^^^^^^^^^  ^^^^^^^^^^
924   //             Compound    NCBI GI
925   //             Accession   Identifier
926   //             Number
927   //
928   private void parseVersion(String inLine)
929   {
930      String fields[] = inLine.substring(12).trim().split("\\s+");
931
932      mCurrentSeq.setID(fields[0]);
933
934      if (fields.length > 1)
935      {
936         mCurrentSeq.setAttribute(NCBI_GI_ATTR, fields[1]);
937      }
938   }
939
940   //---------------------------------------------------------------------------
941   // Parse the KEYWORDS keyword line
942   // Ex:
943   // KEYWORDS    EST.
944   //
945   private void parseKeywords(String inLine)
946   {
947      String field = inLine.substring(12).trim();
948
949      if (field.endsWith("."))
950      {
951         field = field.substring(0, field.length() - 1);
952      }
953
954      if (mCurrentSeq instanceof BioSequencePlus)
955      {
956         ((BioSequencePlus) mCurrentSeq).addKeywords(field.split(",\\s+"));
957      }
958   }
959
960   //---------------------------------------------------------------------------
961   // Parse the SOURCE keyword line
962   // Ex:
963   // SOURCE      Homo sapiens (human)
964   //   ORGANISM  Homo sapiens
965   //             Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
966   //             Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
967   //             Catarrhini; Hominidae; Homo.
968   //
969   // The SOURCE field consists of two parts. The first part is found after
970   // the SOURCE keyword and contains free-format information including an
971   // abbreviated form of the organism name followed by a molecule type;
972   // multiple lines are allowed, but the last line must end with a period.
973   // The second part consists of information found after the ORGANISM
974   // subkeyword. The formal scientific name for the source organism (genus
975   // and species, where appropriate) is found on the same line as ORGANISM.
976   // The records following the ORGANISM line list the taxonomic
977   // classification levels, separated by semicolons and ending with a
978   // period.
979   //
980   private void parseSource(String inLine)
981   {
982      if (GenBankSubkeyword.ORGANISM.equals(mCurrentSubkeyword))
983      {
984         String field = inLine.substring(12).trim();
985         // For now, just keep the first line with the scientific name
986         if (inLine.trim().startsWith(GenBankSubkeyword.ORGANISM.name())
987             && mCurrentSeq instanceof BioSequencePlus)
988         {
989            Set<NCBITaxon> taxons = NCBITaxon.getByName(field);
990            if (CollectionUtil.hasValues(taxons))
991            {
992               NCBITaxon taxon;
993               if (taxons.size() > 1)
994               {
995                  // TODO: Refine with a better way to choose
996                  
997                  // Choose the one with highest id
998                  List<NCBITaxon> sortedTaxons = new ArrayList<>(taxons);
999                  Collections.sort(sortedTaxons);
1000                  taxon = sortedTaxons.get(sortedTaxons.size() - 1);
1001               }
1002               else
1003               {
1004                  taxon = taxons.iterator().next();
1005               }
1006
1007               ((BioSequencePlus) mCurrentSeq).setNCBITaxon(taxon);
1008            }
1009         }
1010      }
1011   }
1012
1013   //---------------------------------------------------------------------------
1014   // Parse the REFERENCE keyword line
1015   // Ex:
1016   // REFERENCE   1  (bases 1 to 342)
1017   //   AUTHORS   Giachino,C., Padovan,E. and Lanzavecchia,A.
1018   //   TITLE     kappa+lambda+ dual receptor B cells are present in the human
1019   //             peripheral repertoire
1020   //   JOURNAL   J. Exp. Med. 181 (3), 1245-1250 (1995)
1021   //    PUBMED   7869042
1022   //
1023   // Publications by the authors of the sequence that discuss the data reported in
1024   // the record. References are automatically sorted within the record based on date
1025   // of publication, showing the oldest references first.
1026   //
1027   // Some sequences have not been reported in papers and show a status of "unpublished"
1028   // or "in press". When an accession number and/or sequence data has appeared in print,
1029   // sequence authors should send the complete citation of the article to update@ncbi.nlm.nih.gov
1030   // and the GenBank staff will revise the record.
1031   //
1032   // Various classes of publication can be present in the References field, including
1033   // journal article, book chapter, book, thesis/monograph, proceedings chapter, proceedings
1034   // from a meeting, and patent.
1035   //
1036   // The last citation in the REFERENCE field usually contains information about the
1037   // submitter of the sequence, rather than a literature citation. It is therefore
1038   // called the "submitter block" and shows the words "Direct Submission" instead of
1039   // an article title. Additional information is provided below, under the header Direct
1040   // Submission. Some older records do not contain a submitter block.
1041   private void parseReference(String inLine)
1042      throws ParseException
1043   {
1044      if (inLine.startsWith(GenBankKeyword.REFERENCE.name()))
1045      {
1046         // REFERENCE   2  (bases 1 to 200000)
1047         mCurrentReference = new SeqCitation();
1048         // TODO: Set the reference seq location
1049
1050         if (mCurrentSeq instanceof BioSequencePlus)
1051         {
1052            ((BioSequencePlus) mCurrentSeq).addReference(mCurrentReference);
1053         }
1054
1055         Matcher m = sReferenceLocationPattern.matcher(inLine);
1056         if (m.find())
1057         {
1058            mCurrentReference.setSeqLocation(new SeqLocation(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
1059         }
1060      }
1061      else
1062      {
1063         String field = inLine.substring(12).trim();
1064
1065         if (GenBankSubkeyword.AUTHORS.equals(mCurrentSubkeyword))
1066         {
1067            // Remove trailing comma if present.
1068            if (field.endsWith(","))
1069            {
1070               field = field.substring(0, field.length() - 1);
1071            }
1072            else if (field.endsWith(" and"))
1073            {
1074               field = field.substring(0, field.length() - 4);
1075            }
1076
1077            if (! field.equals("."))
1078            {
1079               String[] authorStrings = field.split("(,\\s+|\\s+and\\s+)");
1080               for (String authorString : authorStrings)
1081               {
1082                  mCurrentReference.addAuthor(new Author(authorString));
1083               }
1084            }
1085         }
1086         else if (GenBankSubkeyword.TITLE.equals(mCurrentSubkeyword))
1087         {
1088            String title = field;
1089            if (mCurrentReference.getTitle() != null)
1090            {
1091               title = mCurrentReference.getTitle() + " " + title;
1092            }
1093
1094            mCurrentReference.setTitle(title);
1095         }
1096         else if (GenBankSubkeyword.JOURNAL.equals(mCurrentSubkeyword))
1097         {
1098            mCurrentReference.appendRawContent(field);
1099
1100            String journal = field;
1101            if (mCurrentReference.getJournal() != null)
1102            {
1103               journal = mCurrentReference.getJournal().getTitle() + " " + journal;
1104            }
1105
1106            mCurrentReference.setJournal(new Journal(journal));
1107         }
1108         else if (GenBankSubkeyword.PUBMED.equals(mCurrentSubkeyword))
1109         {
1110            mCurrentReference.setPubMedId(field);
1111         }
1112         else if (GenBankSubkeyword.REMARK.equals(mCurrentSubkeyword))
1113         {
1114            String remark = field;
1115            if (mCurrentReference.getRemark() != null)
1116            {
1117               remark = mCurrentReference.getRemark() + " " + remark;
1118            }
1119
1120            mCurrentReference.setRemark(remark);
1121         }
1122      }
1123   }
1124   
1125   //---------------------------------------------------------------------------
1126   private void finishReference()
1127      throws ParseException
1128   {
1129      // Refine the citation based on the JOURNAL content
1130      if (null == mCurrentReference.getType()
1131          || mCurrentReference.getType().equals(CitationType.journal)) 
1132      {
1133         // "Patent: (\\w{2} \\S+)\\s+(\\d+)\\s+(\\d{2}-\\w{3}-\\d{4});(.+)?"
1134         Matcher m = sReferencePatentPattern.matcher(mCurrentReference.getJournal().getTitle());
1135         if (m.matches())
1136         {
1137            // It's not really a journal. It's a patent.
1138            mCurrentReference.setType(CitationType.patent);
1139            PatentData patentData = new PatentData()
1140                  .setTitle(mCurrentReference.getTitle())
1141                  .setInventors(mCurrentReference.getAuthors())
1142                  .setPublicationNum(m.group(1))
1143                  .setSeqIdNum(Integer.parseInt(m.group(2)))
1144                  .setPublicationDate(mDateFormat.parse(m.group(3)));
1145
1146            if (m.group(4) != null)
1147            {
1148               patentData.setApplicants(parsePatentApplicants(m.group(4)));
1149            }
1150
1151            mCurrentReference.setPatentData(patentData);
1152            mCurrentReference.setJournal(null);
1153         }
1154         else
1155         {
1156            // If it's a direct submission of sequences, extract the submission date and contact info
1157            m = sReferenceDirectSubmissionPattern.matcher(mCurrentReference.getJournal().getTitle());
1158            if (m.matches())
1159            {
1160               mCurrentReference.setSubmissionDate(DateUtil.threadsafeParse(m.group(1), sDateFormat));
1161               mCurrentReference.setYear(Integer.parseInt(m.group(2)));
1162               mCurrentReference.setContactInfo(m.group(3));
1163               mCurrentReference.setJournal(null);
1164            }
1165            else
1166            {
1167               // Is it a thesis?
1168               m = sReferenceThesisPattern.matcher(mCurrentReference.getJournal().getTitle());
1169               if (m.matches())
1170               {
1171                  mCurrentReference.setType(CitationType.thesis);
1172                  
1173                  String yearString = m.group(1);
1174                  if (yearString != null)
1175                  {
1176                     mCurrentReference.setYear(Integer.parseInt(yearString));
1177                  }
1178                  
1179                  mCurrentReference.setInstitution(m.group(2));
1180                  mCurrentReference.setJournal(null);
1181               }
1182               else if (mCurrentReference.getJournal().getTitle().startsWith("Published Only in Database"))
1183               {
1184                  mCurrentReference.setType(CitationType.online_database);
1185                  mCurrentReference.setJournal(null);
1186               }
1187               else 
1188               {
1189                  m = sReferenceJournalPattern.matcher(mCurrentReference.getJournal().getTitle());
1190                  if (m.matches())
1191                  {
1192                     // OK, it's a journal. Fill out the rest of the journal-related fields.
1193
1194                     mCurrentReference.setType(CitationType.journal);
1195                     mCurrentReference.setJournal(new Journal(m.group(1)));
1196                     mCurrentReference.setVolume(m.group(2));
1197                     mCurrentReference.setIssue(m.group(3));
1198
1199                     // Group 4 might be pages or an article number
1200                     String pages = m.group(4);
1201                     if (pages != null
1202                           && pages.toUpperCase().startsWith("E"))
1203                     {
1204                        mCurrentReference.setArticleNumber(pages);
1205                     }
1206                     else
1207                     {
1208                        mCurrentReference.setPages(pages);
1209                     }
1210
1211                     String year = m.group(5);
1212                     if (year != null)
1213                     {
1214                        mCurrentReference.setYear(Integer.parseInt(year));
1215                     }
1216                  }
1217               }
1218            }
1219         }
1220      }
1221   }
1222
1223   //---------------------------------------------------------------------------
1224   // Because of a lack of format controls, it's nearly impossible to parse this
1225   // content correctly. The best approach would be some sort of NLP.
1226   // This method is protected instead of private to allow unit testing.
1227   protected List<String> parsePatentApplicants(String inApplicantString)
1228   {
1229      List<String> applicants = new ArrayList<>(3);
1230
1231      // Values should be separated by semi-colons
1232      if (inApplicantString.contains(";"))
1233      {
1234         String[] pieces = inApplicantString.split(";");
1235         for (int i = 0; i < pieces.length; i++)
1236         {
1237            pieces[i] = pieces[i].trim();
1238         }
1239
1240         // Ends with a location?
1241         if (pieces.length > 1
1242               && pieces.length <= 3
1243               && (2 == pieces[pieces.length - 1].length() // Ends in a two letter country code?
1244                   || sPatentLocationPattern.matcher(pieces[pieces.length - 1]).matches() // ', \\w{2}'
1245                   || ! pieces[pieces.length - 1].contains(" "))) // Last piece is a single word (probably a city name)
1246         {
1247            applicants.add(StringUtil.join(pieces,", "));
1248         }
1249         else
1250         {
1251            for (String piece : pieces)
1252            {
1253               applicants.add(piece);
1254            }
1255         }
1256      }
1257      else
1258      {  // Sometimes the values are separated by commas
1259         String[] pieces = inApplicantString.split(",");
1260         for (String piece : pieces)
1261         {
1262            piece = piece.trim();
1263            if (piece.length() > 0)
1264            {
1265               String ucPiece = piece.toUpperCase();
1266
1267               // Is it a single word or a company suffix?
1268               if (applicants.size() > 0
1269                     && (! piece.contains(" ")
1270                     || ucPiece.startsWith("INC ")
1271                     || ucPiece.startsWith("INC. ")
1272                     || ucPiece.startsWith("LLC ")
1273                     || (sPatentParensLocationPattern.matcher(ucPiece).matches()
1274                         && applicants.size() > 0
1275                         && ! sPatentParensLocationPattern.matcher(applicants.get(applicants.size() - 1)).matches())))
1276               {
1277                  // Add it to the previous piece
1278                  int lastIndex = applicants.size() - 1;
1279                  applicants.set(lastIndex, applicants.get(lastIndex) + ", " + piece);
1280               }
1281               else
1282               {
1283                  applicants.add(piece.trim());
1284               }
1285            }
1286         }
1287      }
1288
1289      return applicants;
1290   }
1291   
1292   //---------------------------------------------------------------------------
1293   // Parse the COMMENT keyword line
1294   // Ex:
1295   // COMMENT     Contact: Wilson RK
1296   //             Washington University School of Medicine
1297   //             4444 Forest Park Parkway, Box 8501, St. Louis, MO 63108
1298   //             Tel: 314 286 1800
1299   //             Fax: 314 286 1810
1300   //             Email: est@watson.wustl.edu
1301   //             Insert Size: 1482
1302   //             High quality sequence stops: 353 Source: IMAGE Consortium, LLNL
1303   //             This clone is available royalty-free through LLNL ; contact the
1304   //             IMAGE Consortium (info@image.llnl.gov) for further information.
1305   //             Insert Length: 1482   Std Error: 0.00
1306   //             Seq primer: M13RP1
1307   //             High quality sequence stop: 353.
1308   //
1309   private void parseComment(String inLine)
1310   {
1311      String field = inLine.substring(12).trim();
1312
1313      if (mCurrentSeq.getAttribute(COMMENT_ATTR) != null)
1314      {
1315         mCurrentSeq.setAttribute(COMMENT_ATTR, mCurrentSeq.getAttribute(COMMENT_ATTR) + "\n" + field);
1316      }
1317      else
1318      {
1319         mCurrentSeq.setAttribute(COMMENT_ATTR, field);
1320      }
1321   }
1322
1323   //---------------------------------------------------------------------------
1324   // Parse the DBLINK keyword line
1325   // Ex:
1326   // DBLINK      BioProject:PRJNA174162,PRJNA999998,PRJNA999999
1327   //             BioSample: SAMN01795900
1328   //
1329   // "This line contains cross-references to other underlying resources that
1330   // support the existence of a GenBank sequence record...
1331   // A DBLINK cross-reference consists of two data fields delimited by a colon.
1332   // The first field provides the cross-reference type ("BioProject"), while the
1333   // second contains the actual cross-reference identifier ("PRJNA177352").
1334   // The second field can consist of multiple comma-separated identifiers,
1335   // if a sequence record has multiple DBLINK cross-references of a given type."
1336   //
1337   private void parseDBLink(String inLine)
1338   {
1339      if (mCurrentSeq instanceof BioSequencePlus)
1340      {
1341         BioSequencePlus sequencePlus = (BioSequencePlus) mCurrentSeq;
1342         String field = inLine.substring(12).trim();
1343         String[] pieces = field.split(":");
1344         if (2 == pieces.length)
1345         {
1346            String[] values = pieces[1].split(",");
1347            for (String value : values)
1348            {
1349               sequencePlus.addDbXref(new DbXref(pieces[0].trim(), value.trim()));
1350            }
1351         }
1352         else
1353         {
1354            // Continuation of previous db identifiers
1355            String db = sequencePlus.getDbXrefs().get(sequencePlus.getDbXrefs().size() - 1).getDB();
1356            String[] values = field.split(",");
1357            for (String value : values)
1358            {
1359               sequencePlus.addDbXref(new DbXref(db, value.trim()));
1360            }
1361         }
1362      }
1363   }
1364
1365   //---------------------------------------------------------------------------
1366   private void writeDBLinks(List<DbXref> inDBXrefs, Writer inWriter)
1367         throws IOException
1368   {
1369      Map<String, StringBuilderPlus> xRefMap = new OrderedMap<>(4);
1370
1371      for (DbXref xref : inDBXrefs)
1372      {
1373         StringBuilderPlus line = xRefMap.get(xref.getDB());
1374         if (null == line)
1375         {
1376            line = new StringBuilderPlus(xref.getDB() + ":" + xref.getId());
1377            xRefMap.put(xref.getDB(), line);
1378         }
1379         else
1380         {
1381            line.delimitedAppend(xref.getId());
1382         }
1383      }
1384
1385      int count = 0;
1386      for (String db : xRefMap.keySet())
1387      {
1388         String[] lines = StringUtil.lines(StringUtil.wrap(xRefMap.get(db).toString(), 67));
1389
1390         for (String line : lines)
1391         {
1392            count++;
1393            inWriter.write(String.format("%-12.12s%s", (1 == count ? GenBankKeyword.DBLINK : ""), line));
1394         }
1395      }
1396   }
1397
1398   //---------------------------------------------------------------------------
1399   // Parse the CONTIG keyword line
1400   // Ex:
1401   // CONTIG      join(D86993.1:7160..39752,D87004.2:803..13993)
1402   //
1403   private void parseContig(String inLine)
1404   {
1405      String field = inLine.substring(12).trim();
1406
1407      if (mCurrentSeq.getAttribute(CONTIG_ATTR) != null)
1408      {
1409         mCurrentSeq.setAttribute(CONTIG_ATTR, mCurrentSeq.getAttribute(CONTIG_ATTR) + field);
1410      }
1411      else
1412      {
1413         mCurrentSeq.setAttribute(CONTIG_ATTR, field);
1414      }
1415   }
1416
1417   //---------------------------------------------------------------------------
1418   private void parseFeatures(String inLine)
1419   {
1420      if (! inLine.startsWith(GenBankKeyword.FEATURES.name()))
1421      {
1422         // Is there a feature key on this line?
1423         String featureKeyString = inLine.substring(5, 20).trim();
1424         if (StringUtil.isSet(featureKeyString))
1425         {
1426            GenBankFeatureKey featureKey = GenBankFeatureKey.valueOf(featureKeyString);
1427            if (null == featureKey)
1428            {
1429               throw new SeqFormatException(StringUtil.singleQuote(featureKeyString) + " is not a recognized feature key!");
1430            }
1431
1432            String locationString = inLine.substring(21).trim();
1433            mCurrentFeature = new GenBankFeature(featureKey, new GenBankFeatureLocation(locationString));
1434            if (mCurrentSeq instanceof BioSequencePlus)
1435            {
1436               ((BioSequencePlus) mCurrentSeq).addFeature(mCurrentFeature);
1437            }
1438
1439            // Unquote the previous qualifier if necessary
1440            if (mCurrentFeatureQualifier != null
1441                  && mCurrentFeatureQualifier.getValue().startsWith("\""))
1442            {
1443               mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue()));
1444            }
1445            mCurrentFeatureQualifier = null;
1446         }
1447         else
1448         {
1449            String content = inLine.substring(21).trim();
1450
1451            Matcher m = sFeatureQualifierPattern.matcher(content);
1452            if (m.matches())
1453            {
1454               // New qualifier
1455
1456               // Unquote the previous qualifier if necessary
1457               if (mCurrentFeatureQualifier != null
1458                   && mCurrentFeatureQualifier.getValue().startsWith("\""))
1459               {
1460                  mCurrentFeatureQualifier.setValue(StringUtil.unquote(mCurrentFeatureQualifier.getValue()));
1461               }
1462
1463               GenBankFeatureQualifierName qualifierName = GenBankFeatureQualifierName.valueOf(m.group(1));
1464               if (null == qualifierName)
1465               {
1466                  throw new SeqFormatException(StringUtil.singleQuote(m.group(1)) + " is not a recognized qualifier!");
1467               }
1468
1469               mCurrentFeatureQualifier = new GenBankFeatureQualifier(qualifierName);
1470               mCurrentFeature.addQualifier(mCurrentFeatureQualifier);
1471
1472               String value = m.group(2);
1473               if (value != null)
1474               {
1475                  mCurrentFeatureQualifier.appendToValue(value);
1476               }
1477            }
1478            else if (mCurrentFeatureQualifier != null)
1479            {
1480               // Continuation of a previous qualifier
1481               mCurrentFeatureQualifier.appendToValue(content);
1482            }
1483            else if (mCurrentFeature != null)
1484            {
1485               // Continuation of a feature location
1486               mCurrentFeature.getLocation().append(content);
1487            }
1488         }
1489      }
1490   }
1491
1492   //---------------------------------------------------------------------------
1493   private void writeReferences(List<SeqCitation> inSeqCitations, Writer inWriter)
1494      throws IOException
1495   {
1496      int count = 0;
1497      for (SeqCitation citation : inSeqCitations)
1498      {
1499         count++;
1500
1501         inWriter.write(String.format("%s   %-3d%s\n",
1502                                      GenBankKeyword.REFERENCE,
1503                                      count,
1504                                      citation.getSeqLocation() != null ? "(bases " + citation.getSeqLocation().getStart() + " to " + citation.getSeqLocation().getEnd() + ")" : ""));
1505
1506         // Authors
1507         StringBuilderPlus authors = new StringBuilderPlus().setDelimiter(", ");
1508         for (int i = 0; i < citation.getAuthors().size(); i++)
1509         {
1510            Author author = citation.getAuthors().get(i);
1511            if (citation.getAuthors().size() > 1
1512                  && i == citation.getAuthors().size() - 1)
1513            {
1514               authors.append(" and " + author.getLastName() + "," + author.getFirstInitial() + ".");
1515            }
1516            else
1517            {
1518               authors.delimitedAppend(author.getLastName() + "," + author.getFirstInitial() + ".");
1519            }
1520         }
1521         String[] lines = StringUtil.lines(StringUtil.wrap(authors.toString(), 67));
1522         for (int i = 0; i < lines.length; i++)
1523         {
1524            inWriter.write(String.format("  %-7.7s   %s\n",
1525                                         0 == i ? GenBankSubkeyword.AUTHORS.name() : "",
1526                                         lines[i]));
1527         }
1528
1529         // Title
1530         lines = StringUtil.lines(StringUtil.wrap(citation.getTitle(), 67));
1531         for (int i = 0; i < lines.length; i++)
1532         {
1533            inWriter.write(String.format("  %-5.5s     %s\n",
1534                                         0 == i ? GenBankSubkeyword.TITLE.name() : "",
1535                                         lines[i]));
1536         }
1537
1538         // Journal
1539         // JOURNAL   J. Exp. Med. 188 (11), 2151-2162 (1998)
1540         StringBuilderPlus journal = new StringBuilderPlus();
1541         if (StringUtil.isSet(citation.toString()))
1542         {
1543            journal.append(citation.toString());
1544         }
1545         else
1546         {
1547            journal.append(citation.getJournal())
1548                  .append(".")
1549                  .append(citation.getVolume() != null ? " " + citation.getVolume() : "")
1550                  .append(citation.getIssue() != null ? " (" + citation.getIssue() + ")" : "")
1551                  .append(citation.getPages() != null ? ", " + citation.getPages() : "")
1552                  .append(citation.getYear() != null ? " (" + citation.getYear() + ")" : "");
1553         }
1554
1555         lines = StringUtil.lines(StringUtil.wrap(journal.toString(), 67));
1556         for (int i = 0; i < lines.length; i++)
1557         {
1558            inWriter.write(String.format("  %-7.7s   %s\n",
1559                                         0 == i ? GenBankSubkeyword.JOURNAL.name() : "",
1560                                         lines[i]));
1561         }
1562
1563         // Pubmed id
1564         if (StringUtil.isSet(citation.getPubMedId()))
1565         {
1566            inWriter.write(String.format("  %-6.6s    %s\n",
1567                                         GenBankSubkeyword.PUBMED.name(),
1568                                         citation.getPubMedId()));
1569         }
1570
1571         // Remark
1572         if (StringUtil.isSet(citation.getRemark()))
1573         {
1574            lines = StringUtil.lines(StringUtil.wrap(citation.getRemark(), 67));
1575            for (int i = 0; i < lines.length; i++)
1576            {
1577               inWriter.write(String.format("  %-6.6s    %s\n",
1578                                            0 == i ? GenBankSubkeyword.REMARK.name() : "",
1579                                            lines[i]));
1580            }
1581         }
1582
1583      }
1584   }
1585
1586   //---------------------------------------------------------------------------
1587   private void writeFeature(SeqFeature inSeqFeature, Writer inWriter)
1588      throws IOException
1589   {
1590
1591      String[] lines = StringUtil.lines(StringUtil.wrap(inSeqFeature.getLocation().toString(), 58));
1592      for (int i = 0; i < lines.length; i++)
1593      {
1594         inWriter.write(String.format("     %-15.15s %s\n",
1595                                      0 == i ? inSeqFeature.name() : "",
1596                                      lines[i]));
1597      }
1598
1599      if (CollectionUtil.hasValues(inSeqFeature.getQualifiers()))
1600      {
1601         for (FeatureQualifier qualifier : inSeqFeature.getQualifiers())
1602         {
1603            String qualifierString = "/" + qualifier.name();
1604            if (StringUtil.isSet(qualifier.getValue()))
1605            {
1606               qualifierString += "=\"" + qualifier.getValue() + "\"";
1607            }
1608
1609            lines = StringUtil.lines(StringUtil.wrap(qualifierString, 58));
1610            for (int i = 0; i < lines.length; i++)
1611            {
1612               inWriter.write(String.format("                     %s\n", lines[i]));
1613            }
1614         }
1615      }
1616   }
1617
1618   //###########################################################################
1619   // INNER CLASS
1620   //###########################################################################
1621
1622   class GenBankSeqFilterReader extends LettersOnlyReader
1623   {
1624      //---------------------------------------------------------------------------
1625      public GenBankSeqFilterReader(Reader inReader)
1626      {
1627         super(inReader);
1628      }
1629
1630      //---------------------------------------------------------------------------
1631      @Override
1632      public int read()
1633            throws IOException
1634      {
1635         int returnChar;
1636
1637         do
1638         {
1639            returnChar = innerRead();
1640         }
1641         while (returnChar >= 0
1642                && (Character.isWhitespace(returnChar)
1643                    || Character.isDigit(returnChar)
1644                    || returnChar == '/'));
1645
1646         return returnChar;
1647      }
1648   }
1649
1650}