package com.hfg.bio.taxonomy.ncbi;

import java.io.*;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;

//------------------------------------------------------------------------------
/**
 * Species class based on the NCBI taxonomy data.
 * A default set of files is included as data sources. To minimize load time and
 * memory the initial default data source has a few common values. If a value is
 * requested that isn't found in this set, a second, more complete but not fully
 * up-to-date data source is loaded and the lightweight data source is discarded.
 * An additional more up-to-date data source can be manually added as an
 * NCBIRemoteTaxonomyDataSource.
 * <p>
 * Instances created directly from a taxon id are populated lazily: the first
 * getter that needs taxonomy data looks the id up in the registered data sources.
 * </p>
 * <div>
 * @author J. Alex Taylor, hairyfatguy.com
 * </div>
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------

public class NCBITaxon implements Comparable<NCBITaxon>
{
   private static final Logger LOGGER = Logger.getLogger(NCBITaxon.class.getPackage().getName());

   // Data sources should be arranged such that smaller/lighter sources are tried first
   // and only if a match is not found do we move to the next "heavier" implementation.
   // Two data sources are provided by default - the first with a few common values and
   // the second which is a mostly complete (but old) taxonomy. If an up-to-date taxonomy
   // is desired, an NCBIRemoteTaxonomyDataSource should be added as a data source.
   // All access to this list is guarded by the NCBITaxon class lock.
   private static final List<NCBITaxonomyDataSource> mDataSources = new ArrayList<>(3);

   static
   {
      addDataSource(new CommonInternalDataSource());
      addDataSource(new FullInternalDataSource());
   }

   // Shortcuts to some common organisms
   /** Human */
   public static final NCBITaxon HOMO_SAPIENS = new NCBITaxon(9606);
   /** Mouse */
   public static final NCBITaxon MUS_MUSCULUS = new NCBITaxon(10090);
   /** Rat */
   public static final NCBITaxon RATTUS_NORVEGICUS = new NCBITaxon(10116);
   /** Hamster */
   public static final NCBITaxon CRICETULUS_GRISEUS = new NCBITaxon(10029);
   /** Rabbit */
   public static final NCBITaxon ORYCTOLAGUS_CUNICULUS = new NCBITaxon(9986);
   /** Cow */
   public static final NCBITaxon BOS_TAURUS = new NCBITaxon(9913);
   /** Horse */
   public static final NCBITaxon EQUUS_CABALLUS = new NCBITaxon(9796);
   /** Pig */
   public static final NCBITaxon SUS_SCROFA = new NCBITaxon(9823);
   /** Sheep */
   public static final NCBITaxon OVIS_ARIES = new NCBITaxon(9940);
   /** Xenopus (African clawed frog) */
   public static final NCBITaxon XENOPUS_LAEVIS = new NCBITaxon(8355);
   /** Drosophila (Fruit fly) */
   public static final NCBITaxon DROSOPHILA_MELANOGASTER = new NCBITaxon(7227);
   /** E. Coli */
   public static final NCBITaxon ESCHERICHIA_COLI = new NCBITaxon(562);
   /** Yeast */
   public static final NCBITaxon SACCHAROMYCES_CEREVISIAE = new NCBITaxon(4932);
   /** Dog */
   public static final NCBITaxon CANIS_FAMILIARIS = new NCBITaxon(9615);
   /** Chimpanzee */
   public static final NCBITaxon PAN_TROGLODYTES = new NCBITaxon(9598);
   /** Rhesus monkey */
   public static final NCBITaxon MACACA_MULATTA = new NCBITaxon(9544);
   /** Camel */
   public static final NCBITaxon CAMELUS_BACTRIANUS = new NCBITaxon(9837);
   /** Dromedary */
   public static final NCBITaxon CAMELUS_DROMEDARIUS = new NCBITaxon(9838);
   /** Llama */
   public static final NCBITaxon LAMA_GLAMA = new NCBITaxon(9844);
   /** Alpaca */
   public static final NCBITaxon VICUGNA_PACOS = new NCBITaxon(30538);

   // If you add to these common defs, add to sCommonSet below and regenerate the short dump files.

   /** Mammals */
   public static final NCBITaxon MAMMALS = new NCBITaxon(40674);
   /** Primates */
   public static final NCBITaxon PRIMATES = new NCBITaxon(9443);
   /** Rodents */
   public static final NCBITaxon RODENTS = new NCBITaxon(9989);

   /** Unknown / unidentified */
   public static final NCBITaxon UNKNOWN = new NCBITaxon(32644);

   /** Synthetic construct / artificial sequence */
   public static final NCBITaxon SYNTHETIC_CONSTRUCT = new NCBITaxon(32630);

   //**************************************************************************
   // PRIVATE FIELDS
   //**************************************************************************

   // False until the taxonomy data has been loaded (or supplied by a data source).
   private boolean mInitialized;
   private int mTaxonId;
   private int mParentTaxonId;
   private String mScientificName;
   private String mCommonName;
   private String mGenBankCommonName;
   private Set<String> mSynonyms;
   private NCBITaxonNodeRank mNodeRank = NCBITaxonNodeRank.NO_RANK;
   private String mEMBL_Code;
   private NCBIGenBankDivision mDivision;
   // The flags are boxed so that "not set" (null) can be told apart from false.
   private Boolean mInheritedDivisionFlag;
   private NCBIGeneticCode mGeneticCode;
   private Boolean mInheritedGeneticCodeFlag;
   private NCBIGeneticCode mMitochondrialGeneticCode;
   private Boolean mInheritedMitochondrialGeneticCodeFlag;
   private Boolean mGenBankHiddenFlag;
   private Boolean mHiddenSubtreeRootFlag;
   private String mComments;


   private static final Set<NCBITaxon> sCommonSet = new HashSet<>();

   private static final String NODES_FILE = "rsrc/nodes.dmp.gz";
   private static final String NAMES_FILE = "rsrc/names.dmp.gz";
   private static final String COMMON_NODES_FILE = "rsrc/nodes_short.dmp.gz";
   private static final String COMMON_NAMES_FILE = "rsrc/names_short.dmp.gz";

   private static final String NL = System.lineSeparator();

   static
   {
      Collections.addAll(sCommonSet,
                         HOMO_SAPIENS,
                         MUS_MUSCULUS,
                         RATTUS_NORVEGICUS,
                         CRICETULUS_GRISEUS,
                         ORYCTOLAGUS_CUNICULUS,
                         BOS_TAURUS,
                         EQUUS_CABALLUS,
                         SUS_SCROFA,
                         OVIS_ARIES,
                         DROSOPHILA_MELANOGASTER,
                         ESCHERICHIA_COLI,
                         XENOPUS_LAEVIS,
                         SACCHAROMYCES_CEREVISIAE,
                         CANIS_FAMILIARIS,
                         PAN_TROGLODYTES,
                         MACACA_MULATTA,
                         CAMELUS_BACTRIANUS,
                         CAMELUS_DROMEDARIUS,
                         LAMA_GLAMA,
                         VICUGNA_PACOS,
                         MAMMALS,
                         PRIMATES,
                         RODENTS,
                         SYNTHETIC_CONSTRUCT,
                         UNKNOWN);
   }


   //**************************************************************************
   // CONSTRUCTORS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    Creates a taxon for the specified id. No data is loaded here; the remaining
    fields are filled in lazily the first time a getter needs them.
    @param inTaxonId the NCBI taxon id
    */
   public NCBITaxon(int inTaxonId)
   {
      mTaxonId = inTaxonId;
   }

   //**************************************************************************
   // PUBLIC FUNCTIONS
   //**************************************************************************

   //---------------------------------------------------------------------------
   /**
    @return the logger used by this class (named after this class's package)
    */
   public static Logger getLogger()
   {
      return LOGGER;
   }

   //--------------------------------------------------------------------------
   /**
    Appends a data source to the end of the lookup chain.
    Synchronized on the class, like the lookup methods that iterate the chain.
    @param inValue the data source to add
    */
   public static synchronized void addDataSource(NCBITaxonomyDataSource inValue)
   {
      mDataSources.add(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Inserts a data source at the specified position of the lookup chain.
    @param inIndex the position at which the data source should be inserted
    @param inValue the data source to add
    */
   public static synchronized void addDataSource(int inIndex, NCBITaxonomyDataSource inValue)
   {
      mDataSources.add(inIndex, inValue);
   }

   //--------------------------------------------------------------------------
   /**
    Replaces all currently registered data sources with the specified one.
    @param inValue the data source to use exclusively
    */
   public static synchronized void setDataSource(NCBITaxonomyDataSource inValue)
   {
      mDataSources.clear();
      mDataSources.add(inValue);
   }

   //--------------------------------------------------------------------------
   /**
    * Retrieves the NCBITaxon for the specified common name, scientific name,
    * or GenBank common name. Generally there will be a single taxon found for a given
    * name, but there are instances where multiple taxons may be found.
    * <p>
    * Note that a data source which does not contain the name is removed from the
    * lookup chain (unless it is the last one).
    * </p>
    @param inValue the species name (common or scientific) for the taxon object to return
    @return a Set of taxon objects corresponding to the specified name. Returns null if
            the name is not set or a match cannot be found.
    */
   public static synchronized Set<NCBITaxon> getByName(String inValue)
   {
      Set<NCBITaxon> taxons = null;
      if (StringUtil.isSet(inValue))
      {
         // Lowercase the value so we can compare the names case-insensitively.
         inValue = inValue.toLowerCase();

         for (int i = 0; i < mDataSources.size(); i++)
         {
            taxons = mDataSources.get(i).getByName(inValue);

            boolean isLastSource = (i == mDataSources.size() - 1);
            if (taxons != null
                || isLastSource)
            {
               break;
            }

            // The requested name wasn't found in the data source.
            // Data sources should be provided in increasing size so
            // since this isn't the last data source, jettison it and move
            // to the next one.
            mDataSources.remove(i--);
            // As in getByTaxonId(), force the common taxons to be re-read
            // from the data sources that remain.
            resetCommonTaxons();
         }
      }

      if (taxons != null)
      {
         for (NCBITaxon taxon : taxons)
         {
            taxon.mInitialized = true;
         }
      }

      return taxons;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the taxon for the specified NCBI taxon id.
    <p>
    Note that a data source which does not contain the id is removed from the
    lookup chain (unless it is the last one) and the common taxon constants are
    reset so that they reload from the remaining data sources.
    </p>
    @param inValue the taxon id for the taxon object to retrieve
    @return the taxon object corresponding to the specified id or null if no
            data source contains it
    */
   public static synchronized NCBITaxon getByTaxonId(int inValue)
   {
      NCBITaxon taxon = null;

      for (int i = 0; i < mDataSources.size(); i++)
      {
         taxon = mDataSources.get(i).getByTaxonId(inValue);

         boolean isLastSource = (i == mDataSources.size() - 1);
         if (taxon != null
             || isLastSource)
         {
            break;
         }

         // The requested id wasn't found in the data source.
         // Data sources should be provided in increasing size so
         // since this isn't the last data source, jettison it and move
         // to the next one.
         mDataSources.remove(i--);
         resetCommonTaxons();
      }

      if (taxon != null)
      {
         taxon.mInitialized = true;
      }

      return taxon;
   }

   //--------------------------------------------------------------------------
   /**
    Returns an unmodifiable Collection of the common taxons (those defined as class constants).
    @return the small collection of frequently used taxon objects
    */
   public static Collection<NCBITaxon> getCommonSet()
   {
      return Collections.unmodifiableCollection(sCommonSet);
   }

   //--------------------------------------------------------------------------
   /**
    Returns the taxon id, the scientific name and, if present, the GenBank common
    name in parentheses. This reads the fields directly and never triggers a data
    load, so the names of a taxon that has not been initialized yet show as null.
    */
   @Override
   public String toString()
   {
      StringBuilder buffer = new StringBuilder();
      buffer.append(mTaxonId);
      buffer.append(" ");

      buffer.append(mScientificName);

      if (mGenBankCommonName != null)
      {
         buffer.append(" (");
         buffer.append(mGenBankCommonName);
         buffer.append(")");
      }

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   public int getTaxonId()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    Builds the lineage of this taxon as a "; "-delimited String of
    "rank scientific-name" entries, starting with the most distant ancestor.
    Taxons without a rank are omitted.
    @return the lineage String (empty if neither this taxon nor its ancestors have a rank)
    */
   public String getFullTaxonomy()
   {
      ensureInitialized();

      StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter("; ");

      if (mParentTaxonId != 1)
      {
         // The parent may be unavailable (unknown parent id or an id that is missing
         // from the data sources); in that case the lineage simply starts here.
         NCBITaxon parentTaxon = getParentTaxon();
         if (parentTaxon != null)
         {
            buffer.append(parentTaxon.getFullTaxonomy());
         }
      }

      if (getTaxonomyRank() != NCBITaxonNodeRank.NO_RANK)
      {
         buffer.delimitedAppend(getTaxonomyRank());
         buffer.append(" ");
         buffer.append(getScientificName());
      }

      return buffer.toString();
   }

   //--------------------------------------------------------------------------
   /**
    Tests whether the specified taxon is an ancestor of this taxon.
    A taxon is not considered a subtaxon of itself.
    @param inTaxon2 the potential ancestor
    @return whether inTaxon2 was found while walking up the parent chain
    */
   public boolean isSubtaxonOf(NCBITaxon inTaxon2)
   {
      NCBITaxon currentTaxon = this;

      // Walk up the parent chain until the root (taxon id 1) or a missing parent.
      while (currentTaxon != null
             && currentTaxon.getTaxonId() != 1)
      {
         currentTaxon = currentTaxon.getParentTaxon();

         if (currentTaxon != null
             && currentTaxon.equals(inTaxon2))
         {
            return true;
         }
      }

      return false;
   }

   //--------------------------------------------------------------------------
   /**
    Finds the closest taxon shared by the lineage of this taxon and the lineage of
    the specified taxon (either taxon itself may be the result).
    @param inTaxon2 the taxon to compare against
    @return the first taxon in common or null if the lineages share nothing
    */
   public NCBITaxon getFirstCommonTaxon(NCBITaxon inTaxon2)
   {
      // Collect this taxon and all of its ancestors.
      Set<NCBITaxon> lineage = new HashSet<>();
      for (NCBITaxon taxon = this; taxon != null; taxon = taxon.getParentTaxon())
      {
         lineage.add(taxon);
      }

      // Now walk up the 2nd taxon's branch until we find a taxon in common.
      for (NCBITaxon taxon = inTaxon2; taxon != null; taxon = taxon.getParentTaxon())
      {
         if (lineage.contains(taxon))
         {
            return taxon;
         }
      }

      return null;
   }

   //--------------------------------------------------------------------------
   /**
    @return the taxon for this taxon's parent id or null if the data sources don't contain it
    */
   public NCBITaxon getParentTaxon()
   {
      return getByTaxonId(getParentTaxonId());
   }

   //--------------------------------------------------------------------------
   /**
    Sets the parent taxon id. A value equal to this taxon's own id is ignored.
    @param inValue the taxon id of the parent
    @return this taxon to allow method chaining
    */
   public NCBITaxon setParentTaxonId(int inValue)
   {
      // It can't be its own parent.
      if (inValue != mTaxonId)
      {
         mParentTaxonId = inValue;
      }

      return this;
   }


   //--------------------------------------------------------------------------
   public int getParentTaxonId()
   {
      ensureInitialized();
      return mParentTaxonId;
   }

   //--------------------------------------------------------------------------
   public String getScientificName()
   {
      ensureInitialized();
      return mScientificName;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setScientificName(String inValue)
   {
      mScientificName = inValue;
      return this;
   }

   //--------------------------------------------------------------------------
   public String getCommonName()
   {
      ensureInitialized();
      return mCommonName;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setCommonName(String inValue)
   {
      mCommonName = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public String getGenBankCommonName()
   {
      ensureInitialized();
      return mGenBankCommonName;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setGenBankCommonName(String inValue)
   {
      mGenBankCommonName = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the synonyms of this taxon (the internal Set, not a copy) or null if there are none
    */
   public Set<String> getSynonyms()
   {
      ensureInitialized();
      return mSynonyms;
   }

   //--------------------------------------------------------------------------
   /**
    Replaces the synonyms with a copy of the specified values.
    @param inValues the new synonyms; null clears them
    @return this taxon to allow method chaining
    */
   public NCBITaxon setSynonyms(Collection<String> inValues)
   {
      mSynonyms = inValues != null ? new HashSet<>(inValues) : null;
      return this;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon addSynonym(String inValue)
   {
      if (null == mSynonyms)
      {
         mSynonyms = new HashSet<>(2);
      }

      mSynonyms.add(inValue);
      return this;
   }


   //--------------------------------------------------------------------------
   public NCBITaxonNodeRank getTaxonomyRank()
   {
      ensureInitialized();
      return mNodeRank;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setTaxonomyRank(NCBITaxonNodeRank inValue)
   {
      mNodeRank = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public String getEMBL_Code()
   {
      ensureInitialized();
      return mEMBL_Code;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setEMBL_Code(String inValue)
   {
      mEMBL_Code = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public NCBIGenBankDivision getDivision()
   {
      ensureInitialized();
      return mDivision;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setDivision(NCBIGenBankDivision inValue)
   {
      mDivision = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the inherited division flag; false if the flag has not been set
    */
   public boolean getInheritedDivisionFlag()
   {
      ensureInitialized();
      // Null-safe: unboxing an unset flag would throw a NullPointerException.
      return Boolean.TRUE.equals(mInheritedDivisionFlag);
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setInheritedDivisionFlag(boolean inValue)
   {
      mInheritedDivisionFlag = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public NCBIGeneticCode getGeneticCode()
   {
      ensureInitialized();
      return mGeneticCode;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setGeneticCode(NCBIGeneticCode inValue)
   {
      mGeneticCode = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the inherited genetic code flag; false if the flag has not been set
    */
   public boolean getInheritedGeneticCodeFlag()
   {
      ensureInitialized();
      return Boolean.TRUE.equals(mInheritedGeneticCodeFlag);
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setInheritedGeneticCodeFlag(boolean inValue)
   {
      mInheritedGeneticCodeFlag = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public NCBIGeneticCode getMitochondrialGeneticCode()
   {
      ensureInitialized();
      return mMitochondrialGeneticCode;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setMitochondrialGeneticCode(NCBIGeneticCode inValue)
   {
      mMitochondrialGeneticCode = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the inherited mitochondrial genetic code flag; false if the flag has not been set
    */
   public boolean getInheritedMitochondrialGeneticCodeFlag()
   {
      ensureInitialized();
      return Boolean.TRUE.equals(mInheritedMitochondrialGeneticCodeFlag);
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setInheritedMitochondrialGeneticCodeFlag(boolean inValue)
   {
      mInheritedMitochondrialGeneticCodeFlag = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the GenBank hidden flag; false if the flag has not been set
    */
   public boolean getGenBankHiddenFlag()
   {
      ensureInitialized();
      return Boolean.TRUE.equals(mGenBankHiddenFlag);
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setGenBankHiddenFlag(boolean inValue)
   {
      mGenBankHiddenFlag = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    @return the hidden subtree root flag; false if the flag has not been set
    */
   public boolean getHiddenSubtreeRootFlag()
   {
      ensureInitialized();
      return Boolean.TRUE.equals(mHiddenSubtreeRootFlag);
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setHiddenSubtreeRootFlag(boolean inValue)
   {
      mHiddenSubtreeRootFlag = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public String getComments()
   {
      ensureInitialized();
      return mComments;
   }

   //--------------------------------------------------------------------------
   public NCBITaxon setComments(String inValue)
   {
      mComments = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   /**
    Two taxons are equal if they have the same taxon id.
    */
   @Override
   public boolean equals(Object inObj)
   {
      return this == inObj
             || (inObj instanceof NCBITaxon
                 && mTaxonId == ((NCBITaxon) inObj).mTaxonId);
   }

   //--------------------------------------------------------------------------
   @Override
   public int hashCode()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    Orders taxons by GenBank division name and then by taxon id. Taxons without a
    division (or division name) sort before those with one. A null argument sorts
    before this taxon.
    @param inObj the taxon to compare against
    @return a negative value, zero, or a positive value if this taxon sorts before,
            equal to, or after the specified taxon
    */
   @Override
   public int compareTo(NCBITaxon inObj)
   {
      if (null == inObj)
      {
         return 1;
      }

      NCBIGenBankDivision division1 = getDivision();
      NCBIGenBankDivision division2 = inObj.getDivision();

      boolean hasName1 = (division1 != null && division1.name() != null);
      boolean hasName2 = (division2 != null && division2.name() != null);

      int result;
      if (hasName1 && hasName2)
      {
         result = division1.name().compareTo(division2.name());
      }
      else if (hasName1)
      {
         result = 1;
      }
      else if (hasName2)
      {
         result = -1;
      }
      else
      {
         // Neither has a division. Treat the divisions as equal so that the ordering
         // stays symmetric (a < b implies b > a) and is decided by the taxon id below.
         result = 0;
      }

      if (0 == result)
      {
         result = Integer.compare(mTaxonId, inObj.mTaxonId);
      }

      return result;
   }

   //--------------------------------------------------------------------------
   /**
    Creates a subset of the nodes file containing just the specified taxon ids.
    One line is written per id, in ascending id order. Parent taxons are only
    included if their ids are part of the specified set.
    @param inTaxonIds the ids to extract from the taxonomy data
    @param inDestFile the nodes file to which the extracted taxon data should be written
    @throws IllegalArgumentException if one of the ids cannot be found in the data sources
    @throws RuntimeException if the file cannot be written
    */
   public static void exportNodesFile(Set<Integer> inTaxonIds, File inDestFile)
   {
      List<Integer> orderedTaxonIds = new ArrayList<>(inTaxonIds);
      Collections.sort(orderedTaxonIds);

      try (Writer fileWriter = new FileWriter(inDestFile))
      {
         String delimiter = "\t|\t";
         StringBuilderPlus lineBuffer = new StringBuilderPlus().setDelimiter(delimiter);
         for (Integer taxonId : orderedTaxonIds)
         {
            NCBITaxon taxon = getRequiredTaxon(taxonId);

            lineBuffer.setLength(0);
            lineBuffer.delimitedAppend(taxon.getTaxonId())
                      .delimitedAppend(taxon.getParentTaxonId())
                      .delimitedAppend(taxon.getTaxonomyRank())
                      .delimitedAppend(taxon.getEMBL_Code())
                      .delimitedAppend(taxon.getDivision().getId())
                      .delimitedAppend(taxon.getInheritedDivisionFlag() ? 1 : 0)
                      .delimitedAppend(taxon.getGeneticCode().getId())
                      .delimitedAppend(taxon.getInheritedGeneticCodeFlag() ? 1 : 0)
                      .delimitedAppend(taxon.getMitochondrialGeneticCode().getId())
                      .delimitedAppend(taxon.getInheritedMitochondrialGeneticCodeFlag() ? 1 : 0)
                      .delimitedAppend(taxon.getGenBankHiddenFlag() ? 1 : 0)
                      .delimitedAppend(taxon.getHiddenSubtreeRootFlag() ? 1 : 0)
                      .delimitedAppend(StringUtil.isSet(taxon.getComments()) ? taxon.getComments() : "");

            writeLine(fileWriter, lineBuffer.toString());
         }
      }
      catch (IOException e)
      {
         throw new RuntimeException("Error writing nodes file " + inDestFile + ".", e);
      }
   }

   //--------------------------------------------------------------------------
   /**
    Creates a subset of the names file containing just the specified taxon ids.
    For each id (in ascending order) one line is written per available name:
    scientific name, common name, GenBank common name, and each synonym.
    Parent taxons are only included if their ids are part of the specified set.
    @param inTaxonIds the ids to extract from the taxonomy data
    @param inDestFile the names file to which the extracted taxon data should be written
    @throws IOException if the file cannot be written
    @throws IllegalArgumentException if one of the ids cannot be found in the data sources
    */
   public static void exportNamesFile(Set<Integer> inTaxonIds, File inDestFile)
      throws IOException
   {
      List<Integer> orderedTaxonIds = new ArrayList<>(inTaxonIds);
      Collections.sort(orderedTaxonIds);

      try (Writer fileWriter = new FileWriter(inDestFile))
      {
         StringBuilderPlus lineBuffer = new StringBuilderPlus().setDelimiter("\t|\t");
         for (Integer taxonId : orderedTaxonIds)
         {
            NCBITaxon taxon = getRequiredTaxon(taxonId);

            if (StringUtil.isSet(taxon.getScientificName()))
            {
               lineBuffer.setLength(0);
               lineBuffer.delimitedAppend(taxon.getTaxonId())
                         .delimitedAppend(taxon.getScientificName())
                         .delimitedAppend("\t") // EMBL name
                         .delimitedAppend(NCBITaxonNameClass.SCIENTIFIC_NAME);

               writeLine(fileWriter, lineBuffer.toString());
            }

            if (StringUtil.isSet(taxon.getCommonName()))
            {
               lineBuffer.setLength(0);
               lineBuffer.delimitedAppend(taxon.getTaxonId())
                         .delimitedAppend(taxon.getCommonName())
                         .delimitedAppend("\t") // EMBL name
                         .delimitedAppend(NCBITaxonNameClass.COMMON_NAME);

               writeLine(fileWriter, lineBuffer.toString());
            }

            if (StringUtil.isSet(taxon.getGenBankCommonName()))
            {
               lineBuffer.setLength(0);
               lineBuffer.delimitedAppend(taxon.getTaxonId())
                         .delimitedAppend(taxon.getGenBankCommonName())
                         .delimitedAppend("\t") // EMBL name
                         .delimitedAppend(NCBITaxonNameClass.GENBANK_COMMON_NAME);

               writeLine(fileWriter, lineBuffer.toString());
            }

            if (CollectionUtil.hasValues(taxon.getSynonyms()))
            {
               for (String synonym : taxon.getSynonyms())
               {
                  lineBuffer.setLength(0);
                  lineBuffer.delimitedAppend(taxon.getTaxonId())
                            .delimitedAppend(synonym)
                            .delimitedAppend("\t") // EMBL name
                            .delimitedAppend(NCBITaxonNameClass.SYNONYM);

                  writeLine(fileWriter, lineBuffer.toString());
               }
            }
         }
      }
   }

   //**************************************************************************
   // PRIVATE METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   // Looks up a taxon for export, failing with a descriptive message
   // (instead of a bare NullPointerException) if the id is unknown.
   private static NCBITaxon getRequiredTaxon(Integer inTaxonId)
   {
      NCBITaxon taxon = getByTaxonId(inTaxonId);
      if (null == taxon)
      {
         throw new IllegalArgumentException("No taxon could be found for taxon id " + inTaxonId + "!");
      }

      return taxon;
   }

   //--------------------------------------------------------------------------
   // Writes the specified text followed by the platform line separator.
   private static void writeLine(Writer inWriter, String inLine)
      throws IOException
   {
      inWriter.write(inLine);
      inWriter.write(NL);
   }

   //--------------------------------------------------------------------------
   // Marks all of the common taxon constants as uninitialized so that they
   // reload their data from the current data sources on next use.
   private static void resetCommonTaxons()
   {
      for (NCBITaxon taxon : sCommonSet)
      {
         taxon.clearData();
      }
   }

   //--------------------------------------------------------------------------
   private void clearData()
   {
      mInitialized = false;
   }

   //--------------------------------------------------------------------------
   // Loads the taxonomy data on first use.
   private void ensureInitialized()
   {
      if (! mInitialized)
      {
         init();
      }
   }

   //--------------------------------------------------------------------------
   // Populates this taxon by copying the data of the taxon that the data sources
   // hold for the same id. If no data source knows the id the fields are left
   // as they are; either way the taxon is marked as initialized afterwards.
   private void init()
   {
      NCBITaxon template = getByTaxonId(mTaxonId);
      if (template != null)
      {
         setParentTaxonId(template.getParentTaxonId());
         setScientificName(template.getScientificName());
         setCommonName(template.getCommonName());
         setGenBankCommonName(template.getGenBankCommonName());
         setSynonyms(template.getSynonyms());
         setTaxonomyRank(template.getTaxonomyRank());
         setDivision(template.getDivision());
         setEMBL_Code(template.getEMBL_Code());
         setGeneticCode(template.getGeneticCode());
         setMitochondrialGeneticCode(template.getMitochondrialGeneticCode());
         setComments(template.getComments());

         // The flags are copied field-to-field so that an unset (null) flag stays
         // unset and is never unboxed. This includes the inherited division flag.
         mInheritedDivisionFlag                 = template.mInheritedDivisionFlag;
         mInheritedGeneticCodeFlag              = template.mInheritedGeneticCodeFlag;
         mInheritedMitochondrialGeneticCodeFlag = template.mInheritedMitochondrialGeneticCodeFlag;
         mGenBankHiddenFlag                     = template.mGenBankHiddenFlag;
         mHiddenSubtreeRootFlag                 = template.mHiddenSubtreeRootFlag;
      }

      mInitialized = true;
   }

   //###########################################################################
   // INNER CLASS
   //###########################################################################

   /**
    Data source that reads its taxonomy from a nodes file and a names file which
    are bundled as class resources (optionally gzip-compressed).
    */
   private abstract static class InternalDataSource extends NCBITaxonomyDataSourceImpl
   {
      private final String mNodesRsrc;
      private final String mNamesRsrc;

      //-----------------------------------------------------------------------
      public InternalDataSource(String inNodesRsrc, String inNamesRsrc)
      {
         mNodesRsrc = inNodesRsrc;
         mNamesRsrc = inNamesRsrc;
      }


      //--------------------------------------------------------------------------
      // Parses the nodes resource and then the names resource.
      protected synchronized void initialize()
      {
         parseNodesFile();
         parseNamesFile();
         // TODO: Trim the maps to conserve space?
      }

      //--------------------------------------------------------------------------
      private void parseNodesFile()
      {
         try (BufferedReader nodeReader = getNodesReader())
         {
            innerParseNodesFile(nodeReader);
         }
         catch (IOException e)
         {
            throw new RuntimeException("Error parsing node file.", e);
         }
      }

      //--------------------------------------------------------------------------
      private void parseNamesFile()
      {
         try (BufferedReader namesReader = getNamesReader())
         {
            innerParseNamesFile(namesReader);
         }
         catch (IOException e)
         {
            throw new RuntimeException("Error parsing names file.", e);
         }
      }


      //--------------------------------------------------------------------------
      private BufferedReader getNodesReader()
         throws IOException
      {
         LOGGER.log(Level.FINE, "Initializing NCBI taxon data source from nodes file "
                                + StringUtil.singleQuote(mNodesRsrc));
         InputStream stream = getResourceStream(mNodesRsrc);

         return new BufferedReader(new InputStreamReader(stream), 1024 * 8);
      }

      //--------------------------------------------------------------------------
      private BufferedReader getNamesReader()
         throws IOException
      {
         InputStream stream = getResourceStream(mNamesRsrc);

         return new BufferedReader(new InputStreamReader(stream), 1024 * 8);
      }

      //--------------------------------------------------------------------------
      // Opens the specified resource (resolved relative to the NCBITaxon class),
      // transparently decompressing it if its name ends with ".gz".
      private static InputStream getResourceStream(String inResource)
         throws IOException
      {
         InputStream stream = NCBITaxon.class.getResourceAsStream(inResource);
         if (null == stream)
         {
            throw new RuntimeException("'" + inResource + "' couldn't be found!");
         }

         if (inResource.endsWith(".gz"))
         {
            try
            {
               stream = new GZIPInputStream(stream);
            }
            catch (IOException | RuntimeException e)
            {
               // Don't leak the underlying stream if the gzip header can't be read.
               try
               {
                  stream.close();
               }
               catch (IOException closeException)
               {
                  e.addSuppressed(closeException);
               }

               throw e;
            }
         }

         return stream;
      }
   }

   //###########################################################################
   // INNER CLASS
   //###########################################################################

   /** Lightweight data source backed by the short nodes/names dump files. */
   private static class CommonInternalDataSource extends InternalDataSource
   {
      //-----------------------------------------------------------------------
      public CommonInternalDataSource()
      {
         super(COMMON_NODES_FILE, COMMON_NAMES_FILE);
      }
   }

   //###########################################################################
   // INNER CLASS
   //###########################################################################

   /** Data source backed by the full nodes/names dump files. */
   private static class FullInternalDataSource extends InternalDataSource
   {
      //-----------------------------------------------------------------------
      public FullInternalDataSource()
      {
         super(NODES_FILE, NAMES_FILE);
      }
   }
}