001package com.hfg.bio.taxonomy.ncbi;
002
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
014
015import com.hfg.util.BooleanUtil;
016import com.hfg.util.StringUtil;
017
018//------------------------------------------------------------------------------
019/**
020 Base class for implementing an NCBI taxonomy data source.
021 <div>
022  @author J. Alex Taylor, hairyfatguy.com
023 </div>
024 */
025//------------------------------------------------------------------------------
026// com.hfg Library
027//
028// This library is free software; you can redistribute it and/or
029// modify it under the terms of the GNU Lesser General Public
030// License as published by the Free Software Foundation; either
031// version 2.1 of the License, or (at your option) any later version.
032//
033// This library is distributed in the hope that it will be useful,
034// but WITHOUT ANY WARRANTY; without even the implied warranty of
035// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
036// Lesser General Public License for more details.
037//
038// You should have received a copy of the GNU Lesser General Public
039// License along with this library; if not, write to the Free Software
040// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
041//
042// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
043// jataylor@hairyfatguy.com
044//------------------------------------------------------------------------------
045
046public abstract class NCBITaxonomyDataSourceImpl implements NCBITaxonomyDataSource
047{
048   private static final Logger LOGGER = Logger.getLogger(NCBIRemoteTaxonomyDataSource.class.getPackage().getName());
049
050   protected static final Pattern sSnCnPattern = Pattern.compile("^(.*?)\\s+\\((.*?)\\)$");
051
052   // The name map uses an Object for the value since it could be a single NCBITaxon
053   // or it could be a Set of multiple NCBITaxons.
054   private Map<String, Object> mNameMap;
055   private Map<Integer, NCBITaxon> mIdMap;
056
057   protected abstract void initialize();
058
059   //-----------------------------------------------------------------------
060   @Override
061   public NCBITaxon getByTaxonId(int inValue)
062   {
063      if (null == mIdMap)
064      {
065         initialize();
066      }
067
068      return mIdMap.get(inValue);
069   }
070
071   //-----------------------------------------------------------------------
072   @Override
073   public Set<NCBITaxon> getByName(String inValue)
074   {
075      if (null == mNameMap)
076      {
077         initialize();
078      }
079
080
081      Object value = mNameMap.get(inValue);
082      if (null == value)
083      {
084         // Didn't find it? Some sources have the scientific name
085         // followed by the common name in parenthesis. ex: 'Homo sapiens (human)'
086         // If both parts return the same taxon, call it a match.
087         Matcher m = sSnCnPattern.matcher(inValue);
088         if (m.matches())
089         {
090            value = mNameMap.get(m.group(1));
091            if (value != null
092                && value != mNameMap.get(m.group(2)))
093            {
094               value = null;
095            }
096         }
097      }
098
099      Set<NCBITaxon> values;
100      if (value instanceof NCBITaxon)
101      {
102         values = new HashSet<>(1);
103         values.add((NCBITaxon) value);
104      }
105      else
106      {
107         values = (Set<NCBITaxon>) value;
108      }
109
110      return values;
111   }
112
113   //--------------------------------------------------------------------------
114   protected void innerParseNodesFile(BufferedReader inReader)
115         throws IOException
116   {
117      if (null == mIdMap)
118      {
119         mIdMap = new HashMap<>();
120      }
121
122      int lineCount = 0;
123      String line;
124      while ((line = inReader.readLine()) != null)
125      {
126         lineCount++;
127
128         // The StringTokenizer actually seems to perfom slightly better than split() here.
129         StringTokenizer st = new StringTokenizer(line, "|");
130
131         if (st.countTokens() != 13)
132         {
133            throw new RuntimeException("Found " + st.countTokens()
134                                       + " fields instead of 13 on line "
135                                       + lineCount + ": " + StringUtil.singleQuote(line));
136         }
137
138         try
139         {
140            int taxonId = Integer.parseInt(st.nextToken().trim());
141            NCBITaxon taxon = mIdMap.get(taxonId);
142            if (null == taxon)
143            {
144               taxon = new NCBITaxon(taxonId);
145               mIdMap.put(taxonId, taxon);
146            }
147
148            taxon.setParentTaxonId(Integer.parseInt(st.nextToken().trim()));
149
150            String nodeRankString = st.nextToken().trim();
151            NCBITaxonNodeRank nodeRank = NCBITaxonNodeRank.valueOf(nodeRankString);
152            if (null == nodeRank)
153            {
154               throw new RuntimeException("Unrecognized taxonomy rank: " + nodeRankString
155                                          + "\nNode file line " + lineCount + ": '" + line + "'");
156            }
157            taxon.setTaxonomyRank(nodeRank);
158
159            taxon.setEMBL_Code(st.nextToken().trim()); // EMBL code
160
161            int divisionId = Integer.parseInt(st.nextToken().trim());
162            NCBIGenBankDivision division = NCBIGenBankDivision.valueOf(divisionId);
163            if (null == division)
164            {
165               throw new RuntimeException("Unrecognized GenBank division: " + divisionId
166                                          + "\nNode file line " + lineCount + ": '" + line + "'");
167            }
168
169            taxon.setDivision(division);
170
171            taxon.setInheritedDivisionFlag(BooleanUtil.valueOf(st.nextToken().trim()));
172
173            String geneticCodeString = st.nextToken().trim();
174            if (StringUtil.isSet(geneticCodeString))
175            {
176               int geneticCodeId = Integer.parseInt(geneticCodeString);
177               taxon.setGeneticCode(NCBIGeneticCode.getById(geneticCodeId));
178            }
179
180            taxon.setInheritedGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim()));
181
182            String mitoGeneticCodeString = st.nextToken().trim();
183            if (StringUtil.isSet(mitoGeneticCodeString))
184            {
185               int geneticCodeId = Integer.parseInt(mitoGeneticCodeString);
186               taxon.setMitochondrialGeneticCode(NCBIGeneticCode.getById(geneticCodeId));
187            }
188
189            taxon.setInheritedMitochondrialGeneticCodeFlag(BooleanUtil.valueOf(st.nextToken().trim()));
190            taxon.setGenBankHiddenFlag(BooleanUtil.valueOf(st.nextToken().trim()));
191            taxon.setHiddenSubtreeRootFlag(BooleanUtil.valueOf(st.nextToken().trim()));
192            taxon.setComments(st.nextToken().trim());
193         }
194         catch (Exception e)
195         {
196            throw new RuntimeException("Error parsing nodes line " + lineCount
197                                       + ": " + StringUtil.singleQuote(line), e);
198         }
199      }
200
201      LOGGER.log(Level.FINE, mIdMap.size() + " taxons loaded");
202
203   }
204
205
206   //--------------------------------------------------------------------------
207   protected void innerParseNamesFile(BufferedReader inReader)
208      throws IOException
209   {
210      if (null == mNameMap)
211      {
212         mNameMap = new HashMap<>();
213      }
214
215      if (null == mIdMap)
216      {
217         mIdMap = new HashMap<>();
218      }
219
220      int lineCount = 0;
221      String line;
222      while ((line = inReader.readLine()) != null)
223      {
224         lineCount++;
225         String pieces[] = line.split("\\|");
226
227         if (pieces.length != 4)
228         {
229            System.err.println("Found " + pieces.length + " fields instead of 4 on line "
230                               + lineCount + ": " + StringUtil.singleQuote(line));
231            continue;
232         }
233
234         try
235         {
236            int taxonId = Integer.parseInt(pieces[0].trim());
237            NCBITaxon taxon = mIdMap.get(taxonId);
238            if (null == taxon)
239            {
240//            throw new RuntimeException("No taxon found for id " + taxonId);
241               taxon = new NCBITaxon(taxonId);
242               mIdMap.put(taxonId, taxon);
243            }
244
245            String name = pieces[1].trim();
246
247            // pieces[2] is EMBL code
248
249            String nameClassString = pieces[3].trim();
250            NCBITaxonNameClass nameClass = NCBITaxonNameClass.valueOf(nameClassString);
251            if (null == nameClass)
252            {
253               throw new RuntimeException("Unrecognized name class: " + nameClassString
254                                          + "\nNames file line " + lineCount + ": '" + line + "'");
255            }
256
257
258            if (nameClass == NCBITaxonNameClass.SCIENTIFIC_NAME)
259            {
260               taxon.setScientificName(name);
261               addToNameMap(name, taxon);
262            }
263            else if (nameClass == NCBITaxonNameClass.COMMON_NAME)
264            {
265               taxon.setCommonName(name);
266               addToNameMap(name, taxon);
267            }
268            else if (nameClass == NCBITaxonNameClass.GENBANK_COMMON_NAME)
269            {
270               taxon.setGenBankCommonName(name);
271               addToNameMap(name, taxon);
272            }
273            else if (nameClass == NCBITaxonNameClass.SYNONYM)
274            {
275               taxon.addSynonym(name);
276               addToNameMap(name, taxon);
277            }
278
279            // Every name should be used as a reference to the taxon.
280//            mNameMap.put(name.toLowerCase(), taxon);
281         }
282         catch (Exception e)
283         {
284            throw new RuntimeException("Error parsing line " + lineCount
285                                       + ": " + StringUtil.singleQuote(line), e);
286         }
287      }
288
289      LOGGER.log(Level.FINE, mNameMap.size() + " names loaded");
290   }
291
292   //--------------------------------------------------------------------------
293   protected void addToNameMap(String inName, NCBITaxon inTaxon)
294   {
295      String key = inName.toLowerCase();
296
297      Object existingValue = mNameMap.get(key);
298      if (existingValue != null)
299      {
300         Set<NCBITaxon> set;
301         if (existingValue instanceof NCBITaxon)
302         {
303            set = new HashSet<>(2);
304            set.add((NCBITaxon) existingValue);
305            mNameMap.put(key, set);
306         }
307         else
308         {
309            set = (Set<NCBITaxon>) existingValue;
310         }
311
312         set.add(inTaxon);
313      }
314      else
315      {
316         mNameMap.put(key, inTaxon);
317      }
318   }
319}