001package com.hfg.bio.taxonomy.ncbi; 002 003import java.io.BufferedReader; 004import java.io.IOException; 005import java.util.HashMap; 006import java.util.HashSet; 007import java.util.Map; 008import java.util.Set; 009import java.util.StringTokenizer; 010import java.util.logging.Level; 011import java.util.logging.Logger; 012import java.util.regex.Matcher; 013import java.util.regex.Pattern; 014 015import com.hfg.util.BooleanUtil; 016import com.hfg.util.StringUtil; 017 018//------------------------------------------------------------------------------ 019/** 020 Base class for implementing an NCBI taxonomy data source. 021 <div> 022 @author J. Alex Taylor, hairyfatguy.com 023 </div> 024 */ 025//------------------------------------------------------------------------------ 026// com.hfg Library 027// 028// This library is free software; you can redistribute it and/or 029// modify it under the terms of the GNU Lesser General Public 030// License as published by the Free Software Foundation; either 031// version 2.1 of the License, or (at your option) any later version. 032// 033// This library is distributed in the hope that it will be useful, 034// but WITHOUT ANY WARRANTY; without even the implied warranty of 035// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 036// Lesser General Public License for more details. 037// 038// You should have received a copy of the GNU Lesser General Public 039// License along with this library; if not, write to the Free Software 040// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 041// 042// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 043// jataylor@hairyfatguy.com 044//------------------------------------------------------------------------------ 045 046public abstract class NCBITaxonomyDataSourceImpl implements NCBITaxonomyDataSource 047{ 048 private static final Logger LOGGER = Logger.getLogger(NCBIRemoteTaxonomyDataSource.class.getPackage().getName()); 049 050 protected static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$"); 051 052 // The name map uses an Object for the value since it could be a single NCBITaxon 053 // or it could be a Set of multiple NCBITaxons. 054 private Map<String, Object> mNameMap; 055 private Map<Integer, NCBITaxon> mIdMap; 056 057 protected abstract void initialize(); 058 059 //----------------------------------------------------------------------- 060 @Override 061 public NCBITaxon getByTaxonId(int inValue) 062 { 063 if (null == mIdMap) 064 { 065 initialize(); 066 } 067 068 return mIdMap.get(inValue); 069 } 070 071 //----------------------------------------------------------------------- 072 @Override 073 public Set<NCBITaxon> getByName(String inValue) 074 { 075 if (null == mNameMap) 076 { 077 initialize(); 078 } 079 080 081 Object value = mNameMap.get(inValue); 082 if (null == value) 083 { 084 // Didn't find it? Some sources have the scientific name 085 // followed by the common name in parenthesis. ex: 'Homo sapiens (human)' 086 // If both parts return the same taxon, call it a match. 087 Matcher m = sSnCnPattern.matcher(inValue); 088 if (m.matches()) 089 { 090 value = mNameMap.get(m.group(1)); 091 if (value != null 092 && value != mNameMap.get(m.group(2))) 093 { 094 value = null; 095 } 096 } 097 } 098 099 Set<NCBITaxon> values; 100 if (value instanceof NCBITaxon) 101 { 102 values = new HashSet<>(1); 103 values.add((NCBITaxon) value); 104 } 105 else 106 { 107 values = (Set<NCBITaxon>) value; 108 } 109 110 return values; 111 } 112 113 //-------------------------------------------------------------------------- 114 protected void innerParseNodesFile(BufferedReader inReader) 115 throws IOException 116 { 117 if (null == mIdMap) 118 { 119 mIdMap = new HashMap<>(); 120 } 121 122 int lineCount = 0; 123 String line; 124 while ((line = inReader.readLine()) != null) 125 { 126 lineCount++; 127 128 // The StringTokenizer actually seems to perfom slightly better than split() here. 129 StringTokenizer st = new StringTokenizer(line, "|"); 130 131 if (st.countTokens() != 13) 132 { 133 throw new RuntimeException("Found " + st.countTokens() 134 + " fields instead of 13 on line " 135 + lineCount + ": " + StringUtil.singleQuote(line)); 136 } 137 138 try 139 { 140 int taxonId = Integer.parseInt(st.nextToken().trim()); 141 NCBITaxon taxon = mIdMap.get(taxonId); 142 if (null == taxon) 143 { 144 taxon = new NCBITaxon(taxonId); 145 mIdMap.put(taxonId, taxon); 146 } 147 148 taxon.setParentTaxonId(Integer.parseInt(st.nextToken().trim())); 149 150 String nodeRankString = st.nextToken().trim(); 151 NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString); 152 if (null == nodeRank) 153 { 154 throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString 155 + "\nNode file line " + lineCount + ": '" + line + "'"); 156 } 157 taxon.setTaxonomyRank(nodeRank); 158 159 taxon.setEMBL_Code(st.nextToken().trim()); // EMBL code 160 161 int divisionId = Integer.parseInt(st.nextToken().trim()); 162 NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId); 163 if (null == division) 164 { 165 throw new RuntimeException("Unrecognized GenBank division: " + divisionId 166 + "\nNode file line " + lineCount + ": '" + line + "'"); 167 } 168 169 taxon.setDivision(division); 170 171 taxon.setInheritedDivisionFlag(BooleanUtil.valueOf(st.nextToken().trim())); 172 173 String geneticCodeString = st.nextToken().trim(); 174 if (StringUtil.isSet(geneticCodeString)) 175 { 176 int geneticCodeId = Integer.parseInt(geneticCodeString); 177 taxon.setGeneticCode(NCBIGeneticCode.getById(geneticCodeId)); 178 } 179 180 taxon.setInheritedGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim())); 181 182 String mitoGeneticCodeString = st.nextToken().trim(); 183 if (StringUtil.isSet(mitoGeneticCodeString)) 184 { 185 int geneticCodeId = Integer.parseInt(mitoGeneticCodeString); 186 taxon.setMitochondrialGeneticCode(NCBIGeneticCode.getById(geneticCodeId)); 187 } 188 189 taxon.setInheritedMitochondrialGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim())); 190 taxon.setGenBankHiddenFlag(BooleanUtil.valueOf(st.nextToken().trim())); 191 taxon.setHiddenSubtreeRootFlag(BooleanUtil.valueOf(st.nextToken().trim())); 192 taxon.setComments(st.nextToken().trim()); 193 } 194 catch (Exception e) 195 { 196 throw new RuntimeException("Error parsing nodes line " + lineCount 197 + ": " + StringUtil.singleQuote(line), e); 198 } 199 } 200 201 LOGGER.log(Level.FINE, mIdMap.size() + " taxons loaded"); 202 203 } 204 205 206 //-------------------------------------------------------------------------- 207 protected void innerParseNamesFile(BufferedReader inReader) 208 throws IOException 209 { 210 if (null == mNameMap) 211 { 212 mNameMap = new HashMap<>(); 213 } 214 215 if (null == mIdMap) 216 { 217 mIdMap = new HashMap<>(); 218 } 219 220 int lineCount = 0; 221 String line; 222 while ((line = inReader.readLine()) != null) 223 { 224 lineCount++; 225 String pieces[] = line.split("\\|"); 226 227 if (pieces.length != 4) 228 { 229 System.err.println("Found " + pieces.length + " fields instead of 4 on line " 230 + lineCount + ": " + StringUtil.singleQuote(line)); 231 continue; 232 } 233 234 try 235 { 236 int taxonId = Integer.parseInt(pieces[0].trim()); 237 NCBITaxon taxon = mIdMap.get(taxonId); 238 if (null == taxon) 239 { 240// throw new RuntimeException("No taxon found for id " + taxonId); 241 taxon = new NCBITaxon(taxonId); 242 mIdMap.put(taxonId, taxon); 243 } 244 245 String name = pieces[1].trim(); 246 247 // pieces[2] is EMBL code 248 249 String nameClassString = pieces[3].trim(); 250 NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString); 251 if (null == nameClass) 252 { 253 throw new RuntimeException("Unrecognized name class: " + nameClassString 254 + "\nNames file line " + lineCount + ": '" + line + "'"); 255 } 256 257 258 if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME) 259 { 260 taxon.setScientificName(name); 261 addToNameMap(name, taxon); 262 } 263 else if (nameClass == NCBITaxonNameClass.COMMON_NAME) 264 { 265 taxon.setCommonName(name); 266 addToNameMap(name, taxon); 267 } 268 else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME) 269 { 270 taxon.setGenBankCommonName(name); 271 addToNameMap(name, taxon); 272 } 273 else if (nameClass == NCBITaxonNameClass.SYNONYM) 274 { 275 taxon.addSynonym(name); 276 addToNameMap(name, taxon); 277 } 278 279 // Every name should be used as a reference to the taxon. 280// mNameMap.put(name.toLowerCase(), taxon); 281 } 282 catch (Exception e) 283 { 284 throw new RuntimeException("Error parsing line " + lineCount 285 + ": " + StringUtil.singleQuote(line), e); 286 } 287 } 288 289 LOGGER.log(Level.FINE, mNameMap.size() + " names loaded"); 290 } 291 292 //-------------------------------------------------------------------------- 293 protected void addToNameMap(String inName, NCBITaxon inTaxon) 294 { 295 String key = inName.toLowerCase(); 296 297 Object existingValue = mNameMap.get(key); 298 if (existingValue != null) 299 { 300 Set<NCBITaxon> set; 301 if (existingValue instanceof NCBITaxon) 302 { 303 set = new HashSet<>(2); 304 set.add((NCBITaxon) existingValue); 305 mNameMap.put(key, set); 306 } 307 else 308 { 309 set = (Set<NCBITaxon>) existingValue; 310 } 311 312 set.add(inTaxon); 313 } 314 else 315 { 316 mNameMap.put(key, inTaxon); 317 } 318 } 319}