001package com.hfg.bio.taxonomy.ncbi;
002
003import java.io.*;
004import java.util.*;
005import java.util.logging.Level;
006import java.util.logging.Logger;
007import java.util.zip.GZIPInputStream;
008
009import com.hfg.util.StringBuilderPlus;
010import com.hfg.util.StringUtil;
011import com.hfg.util.collection.CollectionUtil;
012
013//------------------------------------------------------------------------------
014/**
015 * Species class based on the NCBI taxonomy data.
 * A default set of files is included as data sources. To minimize load time and
017 * memory the initial default data source has a few common values. If a value is
018 * requested that isn't found in this set, a second, more complete but not fully
019 * up-to-date data source is loaded and the lightweight data source is discarded.
020 * An additional more up-to-date data source can be manually added as an
021 * NCBIRemoteTaxonomyDataSource.
022 * <div>
023 *  @author J. Alex Taylor, hairyfatguy.com
024 * </div>
025 */
026//------------------------------------------------------------------------------
027// com.hfg XML/HTML Coding Library
028//
029// This library is free software; you can redistribute it and/or
030// modify it under the terms of the GNU Lesser General Public
031// License as published by the Free Software Foundation; either
032// version 2.1 of the License, or (at your option) any later version.
033//
034// This library is distributed in the hope that it will be useful,
035// but WITHOUT ANY WARRANTY; without even the implied warranty of
036// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
037// Lesser General Public License for more details.
038//
039// You should have received a copy of the GNU Lesser General Public
040// License along with this library; if not, write to the Free Software
041// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
042//
043// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
044// jataylor@hairyfatguy.com
045//------------------------------------------------------------------------------
046
047public class NCBITaxon implements Comparable<NCBITaxon>
048{
049   private static final Logger LOGGER = Logger.getLogger(NCBITaxon.class.getPackage().getName());
050
051   // Data sources should be arranged such that smaller/lighter sources are tried first
052   // and only if a match is not found do we move to the next "heavier" implementation.
053   // Two data sources are provided by default - the first with a few common values and
054   // the second which is a mostly complete (but old) taxonomy. If an up-to-date taxonomy
055   // is desired, an NCBIRemoteTaxonomyDataSource should be added as a data source.
056   private static final List<NCBITaxonomyDataSource> mDataSources = new ArrayList<>(3);
057
058   static
059   {
060      addDataSource(new CommonInternalDataSource());
061      addDataSource(new FullInternalDataSource());
062   }
063
064   // Shortcuts to some common organisms
065   /** Human */
066   public static final NCBITaxon HOMO_SAPIENS             = new NCBITaxon(9606);
067   /** Mouse */
068   public static final NCBITaxon MUS_MUSCULUS             = new NCBITaxon(10090);
069   /** Rat */
070   public static final NCBITaxon RATTUS_NORVEGICUS        = new NCBITaxon(10116);
071   /** Hamster */
072   public static final NCBITaxon CRICETULUS_GRISEUS       = new NCBITaxon(10029);
073   /** Rabbit */
074   public static final NCBITaxon ORYCTOLAGUS_CUNICULUS    = new NCBITaxon(9986);
075   /** Cow */
076   public static final NCBITaxon BOS_TAURUS               = new NCBITaxon(9913);
077   /** Horse */
078   public static final NCBITaxon EQUUS_CABALLUS           = new NCBITaxon(9796);
079   /** Pig */
080   public static final NCBITaxon SUS_SCROFA               = new NCBITaxon(9823);
081   /** Sheep */
082   public static final NCBITaxon OVIS_ARIES               = new NCBITaxon(9940);
083   /** Xenopus (African clawed frog) */
084   public static final NCBITaxon XENOPUS_LAEVIS           = new NCBITaxon(8355);
085   /** Drosophila (Fruit fly) */
086   public static final NCBITaxon DROSOPHILA_MELANOGASTER  = new NCBITaxon(7227);
087   /** E. Coli */
088   public static final NCBITaxon ESCHERICHIA_COLI         = new NCBITaxon(562);
089   /** Yeast */
090   public static final NCBITaxon SACCHAROMYCES_CEREVISIAE = new NCBITaxon(4932);
091   /** Dog */
092   public static final NCBITaxon CANIS_FAMILIARIS         = new NCBITaxon(9615);
093   /** Chimpanzee */
094   public static final NCBITaxon PAN_TROGLODYTES          = new NCBITaxon(9598);
095   /** Rhesus monkey */
096   public static final NCBITaxon MACACA_MULATTA           = new NCBITaxon(9544);
097   /** Camel */
098   public static final NCBITaxon CAMELUS_BACTRIANUS       = new NCBITaxon(9837);
099   /** Dromedary */
100   public static final NCBITaxon CAMELUS_DROMEDARIUS      = new NCBITaxon(9838);
101   /** Llama */
102   public static final NCBITaxon LAMA_GLAMA               = new NCBITaxon(9844);
103   /** Alpaca */
104   public static final NCBITaxon VICUGNA_PACOS            = new NCBITaxon(30538);
105
106   // If you add to these common defs, add to sCommonSet below and regenerate the short dump files.
107
108   /** Mammals */
109   public static final NCBITaxon MAMMALS                  = new NCBITaxon(40674);
110   /** Primates */
111   public static final NCBITaxon PRIMATES                 = new NCBITaxon(9443);
112   /** Rodents */
113   public static final NCBITaxon RODENTS                  = new NCBITaxon(9989);
114
115   /** Unknown / unidentified */
116   public static final NCBITaxon UNKNOWN                  = new NCBITaxon(32644);
117
118   /** Synthetic construct / artificial sequence */
119   public static final NCBITaxon SYNTHETIC_CONSTRUCT      = new NCBITaxon(32630);
120
121   //**************************************************************************
122   // PRIVATE FIELDS
123   //**************************************************************************
124
125   private boolean             mInitialized;
126   private int                 mTaxonId;
127   private int                 mParentTaxonId;
128   private String              mScientificName;
129   private String              mCommonName;
130   private String              mGenBankCommonName;
131   private Set<String>         mSynonyms;
132   private NCBITaxonNodeRank   mNodeRank = NCBITaxonNodeRank.NO_RANK;
133   private String              mEMBL_Code;
134   private NCBIGenBankDivision mDivision;
135   private Boolean             mInheritedDivisionFlag;
136   private NCBIGeneticCode     mGeneticCode;
137   private Boolean             mInheritedGeneticCodeFlag;
138   private NCBIGeneticCode     mMitochondrialGeneticCode;
139   private Boolean             mInheritedMitochondrialGeneticCodeFlag;
140   private Boolean             mGenBankHiddenFlag;
141   private Boolean             mHiddenSubtreeRootFlag;
142   private String              mComments;
143
144
145   private static final Set<NCBITaxon> sCommonSet = new HashSet<>();
146
147   private static final String NODES_FILE = "rsrc/nodes.dmp.gz";
148   private static final String NAMES_FILE = "rsrc/names.dmp.gz";
149   private static final String COMMON_NODES_FILE = "rsrc/nodes_short.dmp.gz";
150   private static final String COMMON_NAMES_FILE = "rsrc/names_short.dmp.gz";
151
152   private static final String NL = System.getProperty("line.separator");
153
154   static
155   {
156      sCommonSet.add(HOMO_SAPIENS);
157      sCommonSet.add(MUS_MUSCULUS);
158      sCommonSet.add(RATTUS_NORVEGICUS);
159      sCommonSet.add(CRICETULUS_GRISEUS);
160      sCommonSet.add(ORYCTOLAGUS_CUNICULUS);
161      sCommonSet.add(BOS_TAURUS);
162      sCommonSet.add(EQUUS_CABALLUS);
163      sCommonSet.add(SUS_SCROFA);
164      sCommonSet.add(OVIS_ARIES);
165      sCommonSet.add(DROSOPHILA_MELANOGASTER);
166      sCommonSet.add(ESCHERICHIA_COLI);
167      sCommonSet.add(XENOPUS_LAEVIS);
168      sCommonSet.add(SACCHAROMYCES_CEREVISIAE);
169      sCommonSet.add(CANIS_FAMILIARIS);
170      sCommonSet.add(PAN_TROGLODYTES);
171      sCommonSet.add(MACACA_MULATTA);
172      sCommonSet.add(CAMELUS_BACTRIANUS);
173      sCommonSet.add(CAMELUS_DROMEDARIUS);
174      sCommonSet.add(LAMA_GLAMA);
175      sCommonSet.add(VICUGNA_PACOS);
176      sCommonSet.add(MAMMALS);
177      sCommonSet.add(PRIMATES);
178      sCommonSet.add(RODENTS);
179      sCommonSet.add(SYNTHETIC_CONSTRUCT);
180      sCommonSet.add(UNKNOWN);
181   }
182
183
184   //**************************************************************************
185   // CONSTRUCTORS
186   //**************************************************************************
187
188   //--------------------------------------------------------------------------
189   public NCBITaxon(int inTaxonId)
190   {
191      mTaxonId = inTaxonId;
192   }
193
194   //**************************************************************************
195   // PUBLIC FUNCTIONS
196   //**************************************************************************
197
198   //---------------------------------------------------------------------------
199   public static Logger getLogger()
200   {
201      return LOGGER;
202   }
203
204   //--------------------------------------------------------------------------
205   public static void addDataSource(NCBITaxonomyDataSource inValue)
206   {
207      mDataSources.add(inValue);
208   }
209
210   //--------------------------------------------------------------------------
211   public static void addDataSource(int inIndex, NCBITaxonomyDataSource inValue)
212   {
213      mDataSources.add(inIndex, inValue);
214   }
215
216   //--------------------------------------------------------------------------
217   public static void setDataSource(NCBITaxonomyDataSource inValue)
218   {
219      mDataSources.clear();
220      mDataSources.add(inValue);
221   }
222
223   //--------------------------------------------------------------------------
224   /**
225    * Retrieves the NCBITaxon for the specified common name, scientific name,
226    * or GenBank common name. Generally there will be a single taxon found for a given
227    * name, but there are instances where multiple taxons may be found.
228    @param inValue the species name (common or scientific) for the taxon object to return
229    @return a Set of taxon objects corresponding to the specified name. Returns null if a match cannot be found.
230    */
231   public static synchronized Set<NCBITaxon> getByName(String inValue)
232   {
233      Set<NCBITaxon> taxons = null;
234      if (StringUtil.isSet(inValue))
235      {
236         // Lowercase the value so we can compare the names case-insensitively.
237         inValue = inValue.toLowerCase();
238
239         for (int i = 0; i < mDataSources.size(); i++)
240         {
241            NCBITaxonomyDataSource dataSource = mDataSources.get(i);
242
243            taxons = dataSource.getByName(inValue);
244            if (null == taxons
245                & i < mDataSources.size() - 1)
246            {
247               // The requested id wasn't found in the data source.
248               // Data sources should be provided in increasing size so
249               // if this isn't the last data source, jetison it and move
250               // to the next one.
251               mDataSources.remove(i--);
252            }
253            else
254            {
255               break;
256            }
257         }
258      }
259
260      if (taxons != null)
261      {
262         for (NCBITaxon taxon : taxons)
263         {
264            taxon.mInitialized = true;
265         }
266      }
267
268      return taxons;
269   }
270
271   //--------------------------------------------------------------------------
272   /**
273    Returns the taxon for the specified NCBI taxon id.
274    @param inValue the taxon id for the taxon object to retrieve
275    @return the taxon object corresponding to the specified id
276    */
277   public static synchronized NCBITaxon getByTaxonId(int inValue)
278   {
279      NCBITaxon taxon = null;
280
281      for (int i = 0; i < mDataSources.size(); i++)
282      {
283         NCBITaxonomyDataSource dataSource = mDataSources.get(i);
284
285         taxon = dataSource.getByTaxonId(inValue);
286         if (null == taxon
287             & i < mDataSources.size() - 1)
288         {
289            // The requested id wasn't found in the data source.
290            // Data sources should be provided in increasing size so
291            // if this isn't the last data source, jetison it and move
292            // to the next one.
293            mDataSources.remove(i--);
294            resetCommonTaxons();
295         }
296         else
297         {
298            break;
299         }
300      }
301      
302      if (taxon != null)
303      {
304         taxon.mInitialized = true;
305      }
306
307      return taxon;
308   }
309
310   //--------------------------------------------------------------------------
311   /**
312    Returns an unmodifiable Collection of the common taxons (those defined as class constants).
313    @return the small collection of frequently used taxon objects
314    */
315   public static Collection<NCBITaxon> getCommonSet()
316   {
317      return Collections.unmodifiableCollection(sCommonSet);
318   }
319
320   //--------------------------------------------------------------------------
321   @Override
322   public String toString()
323   {
324      StringBuilder buffer = new StringBuilder();
325      buffer.append(mTaxonId);
326      buffer.append(" ");
327
328      buffer.append(mScientificName);
329
330      if (mGenBankCommonName != null)
331      {
332         buffer.append(" (");
333         buffer.append(mGenBankCommonName);
334         buffer.append(")");
335      }
336
337      return buffer.toString();
338   }
339
340   //--------------------------------------------------------------------------
341   public int getTaxonId()
342   {
343      return mTaxonId;
344   }
345
346   //--------------------------------------------------------------------------
347   public String getFullTaxonomy()
348   {
349      if (! mInitialized)
350      {
351         init();
352      }
353
354      StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("; ");
355
356      if (mParentTaxonId != 1)
357      {
358         NCBITaxon parentTaxon = getParentTaxon();
359         buffer.append(parentTaxon.getFullTaxonomy());
360      }
361
362      if (getTaxonomyRank() != NCBITaxonNodeRank.NO_RANK)
363      {
364         buffer.delimitedAppend(getTaxonomyRank());
365         buffer.append(" ");
366         buffer.append(getScientificName());
367      }
368
369      return buffer.toString();
370   }
371
372   //--------------------------------------------------------------------------
373   public boolean isSubtaxonOf(NCBITaxon inTaxon2)
374   {
375      boolean result = false;
376      NCBITaxon currentTaxon = this;
377
378      while (currentTaxon != null
379             && currentTaxon.getTaxonId() != 1)
380      {
381         currentTaxon = currentTaxon.getParentTaxon();
382
383         if (currentTaxon != null
384             && currentTaxon.equals(inTaxon2))
385         {
386            result = true;
387            break;
388         }
389      }
390
391      return result;
392   }
393
394   //--------------------------------------------------------------------------
395   public NCBITaxon getFirstCommonTaxon(NCBITaxon inTaxon2)
396   {
397      NCBITaxon firstCommonTaxon = null;
398
399      Set<NCBITaxon> taxonSet = new HashSet<>();
400
401      NCBITaxon currentTaxon = this;
402      while (currentTaxon != null)
403      {
404         taxonSet.add(currentTaxon);
405         currentTaxon = currentTaxon.getParentTaxon();
406      }
407
408
409      // Now walk up the 2nd taxon's branch until we find a taxon in common.
410      currentTaxon = inTaxon2;
411      while (currentTaxon != null)
412      {
413         if (taxonSet.contains(currentTaxon))
414         {
415            firstCommonTaxon = currentTaxon;
416            break;
417         }
418
419         currentTaxon = currentTaxon.getParentTaxon();
420      }
421
422      return firstCommonTaxon;
423   }
424
425   //--------------------------------------------------------------------------
426   public NCBITaxon getParentTaxon()
427   {
428      return getByTaxonId(getParentTaxonId());
429   }
430
431   //--------------------------------------------------------------------------
432   public NCBITaxon setParentTaxonId(int inValue)
433   {
434      // It can't be its own parent.
435      if (inValue != mTaxonId) mParentTaxonId = inValue;
436      return this;
437   }
438
439
440   //--------------------------------------------------------------------------
441   public int getParentTaxonId()
442   {
443      if (! mInitialized)
444      {
445         init();
446      }
447
448      return mParentTaxonId;
449   }
450
451   //--------------------------------------------------------------------------
452   public String getScientificName()
453   {
454      if (! mInitialized)
455      {
456         init();
457      }
458
459      return mScientificName;
460   }
461
462   //--------------------------------------------------------------------------
463   public NCBITaxon setScientificName(String inValue)
464   {
465      mScientificName = inValue;
466      return this;
467   }
468
469   //--------------------------------------------------------------------------
470   public String getCommonName()
471   {
472      if (! mInitialized)
473      {
474         init();
475      }
476
477      return mCommonName;
478   }
479
480   //--------------------------------------------------------------------------
481   public NCBITaxon setCommonName(String inValue)
482   {
483      mCommonName = inValue;
484      return this;
485   }
486
487
488   //--------------------------------------------------------------------------
489   public String getGenBankCommonName()
490   {
491      if (! mInitialized)
492      {
493         init();
494      }
495
496      return mGenBankCommonName;
497   }
498
499   //--------------------------------------------------------------------------
500   public NCBITaxon setGenBankCommonName(String inValue)
501   {
502      mGenBankCommonName = inValue;
503      return this;
504   }
505
506
507   //--------------------------------------------------------------------------
508   public Set<String> getSynonyms()
509   {
510      if (! mInitialized)
511      {
512         init();
513      }
514
515      return mSynonyms;
516   }
517
518   //--------------------------------------------------------------------------
519   public NCBITaxon setSynonyms(Collection<String> inValues)
520   {
521      mSynonyms = inValues != null ? new HashSet<>(inValues) : null;
522      return this;
523   }
524
525   //--------------------------------------------------------------------------
526   public NCBITaxon addSynonym(String inValue)
527   {
528      if (null == mSynonyms)
529      {
530         mSynonyms = new HashSet<>(2);
531      }
532
533      mSynonyms.add(inValue);
534      return this;
535   }
536
537   
538   //--------------------------------------------------------------------------
539   public NCBITaxonNodeRank getTaxonomyRank()
540   {
541      if (! mInitialized)
542      {
543         init();
544      }
545
546      return mNodeRank;
547   }
548
549   //--------------------------------------------------------------------------
550   public NCBITaxon setTaxonomyRank(NCBITaxonNodeRank inValue)
551   {
552      mNodeRank = inValue;
553      return this;
554   }
555
556
557   //--------------------------------------------------------------------------
558   public String getEMBL_Code()
559   {
560      if (! mInitialized)
561      {
562         init();
563      }
564
565      return mEMBL_Code;
566   }
567
568   //--------------------------------------------------------------------------
569   public NCBITaxon setEMBL_Code(String inValue)
570   {
571      mEMBL_Code = inValue;
572      return this;
573   }
574
575
576   //--------------------------------------------------------------------------
577   public NCBIGenBankDivision getDivision()
578   {
579      if (! mInitialized)
580      {
581         init();
582      }
583
584      return mDivision;
585   }
586
587   //--------------------------------------------------------------------------
588   public NCBITaxon setDivision(NCBIGenBankDivision inValue)
589   {
590      mDivision = inValue;
591      return this;
592   }
593
594
595   //--------------------------------------------------------------------------
596   public boolean getInheritedDivisionFlag()
597   {
598      if (! mInitialized)
599      {
600         init();
601      }
602
603      return mInheritedDivisionFlag;
604   }
605
606   //--------------------------------------------------------------------------
607   public NCBITaxon setInheritedDivisionFlag(boolean inValue)
608   {
609      mInheritedDivisionFlag = inValue;
610      return this;
611   }
612
613
614   //--------------------------------------------------------------------------
615   public NCBIGeneticCode getGeneticCode()
616   {
617      if (! mInitialized)
618      {
619         init();
620      }
621
622      return mGeneticCode;
623   }
624
625   //--------------------------------------------------------------------------
626   public NCBITaxon setGeneticCode(NCBIGeneticCode inValue)
627   {
628      mGeneticCode = inValue;
629      return this;
630   }
631
632
633   //--------------------------------------------------------------------------
634   public boolean getInheritedGeneticCodeFlag()
635   {
636      if (! mInitialized)
637      {
638         init();
639      }
640
641      return mInheritedGeneticCodeFlag;
642   }
643
644   //--------------------------------------------------------------------------
645   public NCBITaxon setInheritedGeneticCodeFlag(boolean inValue)
646   {
647      mInheritedGeneticCodeFlag = inValue;
648      return this;
649   }
650
651
652   //--------------------------------------------------------------------------
653   public NCBIGeneticCode getMitochondrialGeneticCode()
654   {
655      if (! mInitialized)
656      {
657         init();
658      }
659
660      return mMitochondrialGeneticCode;
661   }
662
663   //--------------------------------------------------------------------------
664   public NCBITaxon setMitochondrialGeneticCode(NCBIGeneticCode inValue)
665   {
666      mMitochondrialGeneticCode = inValue;
667      return this;
668   }
669
670
671   //--------------------------------------------------------------------------
672   public boolean getInheritedMitochondrialGeneticCodeFlag()
673   {
674      if (! mInitialized)
675      {
676         init();
677      }
678
679      return mInheritedMitochondrialGeneticCodeFlag;
680   }
681
682   //--------------------------------------------------------------------------
683   public NCBITaxon setInheritedMitochondrialGeneticCodeFlag(boolean inValue)
684   {
685      mInheritedMitochondrialGeneticCodeFlag = inValue;
686      return this;
687   }
688
689
690   //--------------------------------------------------------------------------
691   public boolean getGenBankHiddenFlag()
692   {
693      if (! mInitialized)
694      {
695         init();
696      }
697
698      return mGenBankHiddenFlag;
699   }
700
701   //--------------------------------------------------------------------------
702   public NCBITaxon setGenBankHiddenFlag(boolean inValue)
703   {
704      mGenBankHiddenFlag = inValue;
705      return this;
706   }
707
708
709   //--------------------------------------------------------------------------
710   public boolean getHiddenSubtreeRootFlag()
711   {
712      if (! mInitialized)
713      {
714         init();
715      }
716
717      return mHiddenSubtreeRootFlag;
718   }
719
720   //--------------------------------------------------------------------------
721   public NCBITaxon setHiddenSubtreeRootFlag(boolean inValue)
722   {
723      mHiddenSubtreeRootFlag = inValue;
724      return this;
725   }
726
727
728   //--------------------------------------------------------------------------
729   public String getComments()
730   {
731      if (! mInitialized)
732      {
733         init();
734      }
735
736      return mComments;
737   }
738
739   //--------------------------------------------------------------------------
740   public NCBITaxon setComments(String inValue)
741   {
742      mComments = inValue;
743      return this;
744   }
745
746
747   //--------------------------------------------------------------------------
748   @Override
749   public boolean equals(Object inObj)
750   {
751      boolean result = false;
752
753      if (inObj != null)
754      {
755         if (this == inObj
756             || (inObj instanceof NCBITaxon
757                 && mTaxonId == ((NCBITaxon) inObj).mTaxonId))
758         {
759            result = true;
760         }
761      }
762
763      return result;
764   }
765
766   //--------------------------------------------------------------------------
767   @Override
768   public int hashCode()
769   {
770      return mTaxonId;
771   }
772
773   //--------------------------------------------------------------------------
774   public int compareTo(NCBITaxon inObj)
775   {
776      int result = 0;
777
778      if (inObj != null)
779      {
780         NCBIGenBankDivision division = getDivision();
781         if (division != null
782             && division.name() != null)
783         {
784            NCBIGenBankDivision division2 = inObj.getDivision();
785            if (division2 != null
786                && division2.name() != null)
787            {
788               result = division.name().compareTo(division2.name());
789            }
790            else
791            {
792               result = 1;
793            }
794         }
795         else
796         {
797            result = -1;
798         }
799      }
800      else
801      {
802         result = 1;
803      }
804
805      if (0 == result)
806      {
807         if (mTaxonId > inObj.mTaxonId)
808         {
809            result = 1;
810         }
811         else if (mTaxonId < inObj.mTaxonId)
812         {
813            result = -1;
814         }
815      }
816
817      return result;
818   }
819
820   //--------------------------------------------------------------------------
821   /**
822    Creates a subset of the nodes file containing just the specified taxon ids (and their parent taxon ids).
823    @param inTaxonIds the list of id to extract from the taxonomy data
824    @param inDestFile the nodes file to which the extracted taxon data should be written
825    */
826   public static void exportNodesFile(Set<Integer> inTaxonIds, File inDestFile)
827   {
828      List<Integer> orderedTaxonIds = new ArrayList<>(inTaxonIds);
829      Collections.sort(orderedTaxonIds);
830
831      Writer fileWriter = null;
832      try
833      {
834         try
835         {
836            fileWriter = new FileWriter(inDestFile);
837
838            String delimiter = "\t|\t";
839            StringBuilderPlus lineBuffer = new StringBuilderPlus().setDelimiter(delimiter);
840            for (Integer taxonId : orderedTaxonIds)
841            {
842               NCBITaxon taxon = getByTaxonId(taxonId);
843
844               lineBuffer.setLength(0);
845               lineBuffer.delimitedAppend(taxon.getTaxonId())
846                     .delimitedAppend(taxon.getParentTaxonId())
847                     .delimitedAppend(taxon.getTaxonomyRank())
848                     .delimitedAppend(taxon.getEMBL_Code())
849                     .delimitedAppend(taxon.getDivision().getId())
850                     .delimitedAppend(taxon.getInheritedDivisionFlag() ? 1 : 0)
851                     .delimitedAppend(taxon.getGeneticCode().getId())
852                     .delimitedAppend(taxon.getInheritedGeneticCodeFlag() ? 1 : 0)
853                     .delimitedAppend(taxon.getMitochondrialGeneticCode().getId())
854                     .delimitedAppend(taxon.getInheritedMitochondrialGeneticCodeFlag() ? 1 : 0)
855                     .delimitedAppend(taxon.getGenBankHiddenFlag() ? 1 : 0)
856                     .delimitedAppend(taxon.getHiddenSubtreeRootFlag() ? 1 : 0)
857                     .delimitedAppend(StringUtil.isSet(taxon.getComments()) ? taxon.getComments() : "");
858
859               fileWriter.write(lineBuffer.toString());
860               fileWriter.write(NL);
861            }
862         }
863         finally
864         {
865            if (fileWriter != null) fileWriter.close();
866         }
867      }
868      catch (IOException e)
869      {
870         throw new RuntimeException("Error parsing node file.", e);
871      }
872   }
873
874   //--------------------------------------------------------------------------
875   /**
876    Creates a subset of the names file containing just the specified taxon ids (and their parent taxon ids).
877    @param inTaxonIds the list of id to extract from the taxonomy data
878    @param inDestFile the names file to which the extracted taxon data should be written
879    */
880   public static void exportNamesFile(Set<Integer> inTaxonIds, File inDestFile)
881      throws IOException
882   {
883      List<Integer> orderedTaxonIds = new ArrayList<>(inTaxonIds);
884      Collections.sort(orderedTaxonIds);
885
886      Writer fileWriter = null;
887
888      try
889      {
890         fileWriter = new FileWriter(inDestFile);
891
892         StringBuilderPlus lineBuffer = new StringBuilderPlus().setDelimiter("\t|\t");
893         for (Integer taxonId : orderedTaxonIds)
894         {
895            NCBITaxon taxon = getByTaxonId(taxonId);
896
897            if (StringUtil.isSet(taxon.getScientificName()))
898            {
899               lineBuffer.setLength(0);
900               lineBuffer.delimitedAppend(taxon.getTaxonId())
901                     .delimitedAppend(taxon.getScientificName())
902                     .delimitedAppend("\t")   // EMBL name
903                     .delimitedAppend(NCBITaxonNameClass.SCIENTIFIC_NAME);
904
905               fileWriter.write(lineBuffer.toString());
906               fileWriter.write(NL);
907            }
908
909            if (StringUtil.isSet(taxon.getCommonName()))
910            {
911               lineBuffer.setLength(0);
912               lineBuffer.delimitedAppend(taxon.getTaxonId())
913                     .delimitedAppend(taxon.getCommonName())
914                     .delimitedAppend("\t")   // EMBL name
915                     .delimitedAppend(NCBITaxonNameClass.COMMON_NAME);
916
917               fileWriter.write(lineBuffer.toString());
918               fileWriter.write(NL);
919            }
920
921            if (StringUtil.isSet(taxon.getGenBankCommonName()))
922            {
923               lineBuffer.setLength(0);
924               lineBuffer.delimitedAppend(taxon.getTaxonId())
925                     .delimitedAppend(taxon.getGenBankCommonName())
926                     .delimitedAppend("\t")   // EMBL name
927                     .delimitedAppend(NCBITaxonNameClass.GENBANK_COMMON_NAME);
928
929               fileWriter.write(lineBuffer.toString());
930               fileWriter.write(NL);
931            }
932
933            if (CollectionUtil.hasValues(taxon.getSynonyms()))
934            {
935               for (String synonym : taxon.getSynonyms())
936               {
937                  lineBuffer.setLength(0);
938                  lineBuffer.delimitedAppend(taxon.getTaxonId())
939                        .delimitedAppend(synonym)
940                        .delimitedAppend("\t")   // EMBL name
941                        .delimitedAppend(NCBITaxonNameClass.SYNONYM);
942
943                  fileWriter.write(lineBuffer.toString());
944                  fileWriter.write(NL);
945               }
946            }
947         }
948      }
949      finally
950      {
951         if (fileWriter != null) fileWriter.close();
952      }
953
954   }
955
956   //**************************************************************************
957   // PRIVATE METHODS
958   //**************************************************************************
959
960   //--------------------------------------------------------------------------
961   private static void resetCommonTaxons()
962   {
963      for (NCBITaxon taxon : sCommonSet)
964      {
965         taxon.clearData();
966      }
967   }
968
969   //--------------------------------------------------------------------------
970   private void clearData()
971   {
972      mInitialized = false;
973   }
974
975   //--------------------------------------------------------------------------
976   private void init()
977   {
978      NCBITaxon template = getByTaxonId(mTaxonId);
979      if (template != null)
980      {
981         setParentTaxonId(template.getParentTaxonId());
982         setScientificName(template.getScientificName());
983         setCommonName(template.getCommonName());
984         setGenBankCommonName(template.getGenBankCommonName());
985         setSynonyms(template.getSynonyms());
986         setTaxonomyRank(template.getTaxonomyRank());
987         setDivision(template.getDivision());
988         setEMBL_Code(template.getEMBL_Code());
989         setGeneticCode(template.getGeneticCode());
990         setInheritedGeneticCodeFlag(template.getInheritedGeneticCodeFlag());
991         setMitochondrialGeneticCode(template.getMitochondrialGeneticCode());
992         setInheritedMitochondrialGeneticCodeFlag(template.getInheritedMitochondrialGeneticCodeFlag());
993         setGenBankHiddenFlag(template.getGenBankHiddenFlag());
994         setHiddenSubtreeRootFlag(template.getHiddenSubtreeRootFlag());
995         setComments(template.getComments());
996      }
997
998      mInitialized = true;
999   }
1000
1001   //###########################################################################
1002   // INNER CLASS
1003   //###########################################################################
1004
1005   private abstract static class InternalDataSource extends NCBITaxonomyDataSourceImpl
1006   {
1007      private String mNodesRsrc;
1008      private String mNamesRsrc;
1009
1010      //-----------------------------------------------------------------------
1011      public InternalDataSource(String inNodesRsrc, String inNamesRsrc)
1012      {
1013         mNodesRsrc = inNodesRsrc;
1014         mNamesRsrc = inNamesRsrc;
1015      }
1016
1017
1018      //--------------------------------------------------------------------------
1019      protected synchronized void initialize()
1020      {
1021         parseNodesFile();
1022         parseNamesFile();
1023         // TODO: Trim the maps to conserve space?
1024      }
1025
1026      //--------------------------------------------------------------------------
1027      private void parseNodesFile()
1028      {
1029         try
1030         {
1031            BufferedReader nodeReader = null;
1032
1033            try
1034            {
1035               nodeReader = getNodesReader();
1036
1037               innerParseNodesFile(nodeReader);
1038            }
1039            finally
1040            {
1041               if (nodeReader != null) nodeReader.close();
1042            }
1043         }
1044         catch (IOException e)
1045         {
1046            throw new RuntimeException("Error parsing node file.", e);
1047         }
1048      }
1049
1050      //--------------------------------------------------------------------------
1051      private void parseNamesFile()
1052      {
1053         try
1054         {
1055            BufferedReader namesReader = null;
1056
1057            try
1058            {
1059               namesReader = getNamesReader();
1060
1061               innerParseNamesFile(namesReader);
1062            }
1063            finally
1064            {
1065               if (namesReader != null) namesReader.close();
1066            }
1067         }
1068         catch (IOException e)
1069         {
1070            throw new RuntimeException("Error parsing node file.", e);
1071         }
1072      }
1073
1074
1075      //--------------------------------------------------------------------------
1076      private BufferedReader getNodesReader()
1077      throws IOException
1078      {
1079         LOGGER.log(Level.FINE, "Initializing NCBI taxon data source from nodes file "
1080                                + StringUtil.singleQuote(mNodesRsrc));
1081         InputStream stream = getResourceStream(mNodesRsrc);
1082
1083         return new BufferedReader(new InputStreamReader(stream), 1024 * 8);
1084      }
1085
1086      //--------------------------------------------------------------------------
1087      private BufferedReader getNamesReader()
1088      throws IOException
1089      {
1090         InputStream stream = getResourceStream(mNamesRsrc);
1091
1092         return new BufferedReader(new InputStreamReader(stream), 1024 * 8);
1093      }
1094
1095      //--------------------------------------------------------------------------
1096      private static InputStream getResourceStream(String inResource)
1097      throws IOException
1098      {
1099         InputStream stream = NCBITaxon.class.getResourceAsStream(inResource);
1100         if (null == stream)
1101         {
1102            throw new RuntimeException("'" + inResource + "' couldn't be found!");
1103         }
1104
1105         if (inResource.endsWith(".gz"))
1106         {
1107            stream = new GZIPInputStream(stream);
1108         }
1109
1110         return stream;
1111      }
1112   }
1113
1114   //###########################################################################
1115   // INNER CLASS
1116   //###########################################################################
1117
1118   private static class CommonInternalDataSource extends InternalDataSource
1119   {
1120      //-----------------------------------------------------------------------
1121      public CommonInternalDataSource()
1122      {
1123         super(COMMON_NODES_FILE, COMMON_NAMES_FILE);
1124      }
1125   }
1126   //###########################################################################
1127   // INNER CLASS
1128   //###########################################################################
1129
1130   private static class FullInternalDataSource extends InternalDataSource
1131   {
1132      //-----------------------------------------------------------------------
1133      public FullInternalDataSource()
1134      {
1135         super(NODES_FILE, NAMES_FILE);
1136      }
1137   }
1138}