001package com.hfg.bio.taxonomy.uniprot;
002
import com.hfg.util.StringUtil;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
016
017
018//------------------------------------------------------------------------------
019/**
020 * Species class for Uniprot codes. Uses the speclist.txt provided with Uniprot
021 * to map the species codes that are a part of the locus name (Ex: 'HUMAN' in the locus 'TNF_HUMAN').
022 * <div>
023 *  @author J. Alex Taylor, hairyfatguy.com
024 * </div>
025 */
026//------------------------------------------------------------------------------
027// com.hfg XML/HTML Coding Library
028//
029// This library is free software; you can redistribute it and/or
030// modify it under the terms of the GNU Lesser General Public
031// License as published by the Free Software Foundation; either
032// version 2.1 of the License, or (at your option) any later version.
033//
034// This library is distributed in the hope that it will be useful,
035// but WITHOUT ANY WARRANTY; without even the implied warranty of
036// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
037// Lesser General Public License for more details.
038//
039// You should have received a copy of the GNU Lesser General Public
040// License along with this library; if not, write to the Free Software
041// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
042//
043// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
044// jataylor@hairyfatguy.com
045//------------------------------------------------------------------------------
046
public class UniprotSpecies
{

   //**************************************************************************
   // PRIVATE FIELDS
   //**************************************************************************

   private final String        mSpeciesCode;
   private char                mKingdomCode;
   private Integer             mTaxonId;
   private String              mScientificName;
   private String              mCommonName;
   private String              mSynonym;


   // Lazily built lookup of species code -> species. null means "not loaded yet"
   // (either never requested, or reset by setSpeciesListFile()).
   private static Map<String, UniprotSpecies> sCodeIndex;

   // Optional replacement species list. When null the bundled resource is used.
   private static File                        sSpeciesFile;

   private static final String SPECIES_FILE = "rsrc/speclist.txt.gz";

   // Compiled once; these are used for every lookup / every line of the species list.
   private static final Pattern LOCUS_PATTERN  = Pattern.compile("^\\S+_(\\S+)$");
   private static final Pattern N_LINE_PATTERN = Pattern.compile("^(\\S{3,5})\\s+(\\w)\\s+(\\d+|\\?+):\\s+N=(.+)");
   private static final Pattern C_LINE_PATTERN = Pattern.compile("^\\s+C=(.+)");
   private static final Pattern S_LINE_PATTERN = Pattern.compile("^\\s+S=(.+)");

   //**************************************************************************
   // CONSTRUCTORS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    Creates a species entry for the specified code. Instances are only created
    while parsing the species list; registering the entry in the code index is
    the parser's job, so the constructor does not touch any static state.
    @param inCode the Uniprot species code (Ex: 'HUMAN')
    */
   private UniprotSpecies(String inCode)
   {
      mSpeciesCode = inCode;
   }

   //**************************************************************************
   // PUBLIC METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    * Used to load a newer version of the speclist.txt file. The file may be
    * gzip compressed (detected by a '.gz' file name suffix). File found at
    * ftp://www.expasy.ch/databases/uniprot/knowledgebase/docs/speclist.txt
    * <p>
    * The file is not read here; it is read on the next lookup. Passing null
    * reverts to the species list bundled as a class resource.
    * </p>
    @param inValue the file to use as the source of Uniprot species data
    */
   public static void setSpeciesListFile(File inValue)
   {
      sSpeciesFile = inValue;

      // Clear the index so that the new file will be loaded on the next lookup
      sCodeIndex = null;
   }


   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified locus (Ex: 'TNF_HUMAN').
    The species code is taken to be everything after the last underscore and,
    like getByCode(), is matched without regard to case.
    @param inValue the Uniprot locus name for the species object to return
    @return the species object corresponding to the specified Uniprot locus or
            null if the species code of the locus is not in the species list.
    @throws RuntimeException if the locus is null or not of the form 'name_code',
            or if the species list could not be loaded
    */
   public static UniprotSpecies getByLocus(String inValue)
   {
      Map<String, UniprotSpecies> index = initialize();

      Matcher m = (inValue != null ? LOCUS_PATTERN.matcher(inValue) : null);
      if (null == m || !m.matches())
      {
         throw new RuntimeException("The locus '" + inValue + "' is not in the proper format!");
      }

      // Locale.ROOT keeps the case mapping independent of the JVM's default locale.
      return index.get(m.group(1).toUpperCase(Locale.ROOT));
   }

   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified species code (Ex: 'HUMAN' from the locus 'TNF_HUMAN').
    The code is matched without regard to case.
    @param inValue the Uniprot code for the species object to return
    @return the species object corresponding to the specified Uniprot code or
            null if the code is null or not in the species list.
    @throws RuntimeException if the species list could not be loaded
    */
   public static UniprotSpecies getByCode(String inValue)
   {
      Map<String, UniprotSpecies> index = initialize();

      // Index keys are always 3-5 non-whitespace characters, so an empty or blank
      // code simply misses the index; only null needs an explicit guard.
      // Locale.ROOT keeps the case mapping independent of the JVM's default locale
      // (a Turkish default locale would otherwise turn 'i' into a dotted capital I).
      return (inValue != null ? index.get(inValue.toUpperCase(Locale.ROOT)) : null);
   }

   //--------------------------------------------------------------------------
   /**
    Returns the Uniprot species code.
    @return the species code (Ex: 'HUMAN')
    */
   public String getSpeciesCode()
   {
      return mSpeciesCode;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the single-character kingdom code from the second column of the species list.
    @return the kingdom code
    */
   public char getKingdomCode()
   {
      return mKingdomCode;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the taxon id. The value is equal to the NCBI taxon id.
    @return the taxon id or null if the species list gives '?' instead of a number
    */
   public Integer getTaxonId()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the scientific name (the 'N=' value of the species list entry).
    @return the scientific name
    */
   public String getScientificName()
   {
      return mScientificName;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the common name (the 'C=' value of the species list entry).
    @return the common name or null if the entry has no 'C=' line
    */
   public String getCommonName()
   {
      return mCommonName;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the synonym (the 'S=' value of the species list entry).
    @return the synonym or null if the entry has no 'S=' line
    */
   public String getSynonym()
   {
      return mSynonym;
   }


   //**************************************************************************
   // PRIVATE METHODS
   //**************************************************************************


   //--------------------------------------------------------------------------
   /**
    Returns the code index, loading the species list first if necessary.
    The index is only stored once the species list has been parsed completely,
    so a failed load is reported again on the next lookup instead of leaving an
    empty index behind. Callers use the returned map rather than re-reading the
    static field. No synchronization is performed.
    @return the species code index
    */
   private static Map<String, UniprotSpecies> initialize()
   {
      Map<String, UniprotSpecies> index = sCodeIndex;
      if (null == index)
      {
         index = parseSpeciesFile();
         sCodeIndex = index;

         System.out.println(index.size() + " Uniprot species codes loaded");
      }

      return index;
   }


   //--------------------------------------------------------------------------
   private void setScientificName(String inValue)
   {
      mScientificName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setCommonName(String inValue)
   {
      mCommonName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setSynonym(String inValue)
   {
      mSynonym = inValue;
   }

   //--------------------------------------------------------------------------
   private void setTaxonId(int inValue)
   {
      mTaxonId = inValue;
   }

   //--------------------------------------------------------------------------
   private void setKingdomCode(char inValue)
   {
      mKingdomCode = inValue;
   }

   //--------------------------------------------------------------------------
   /**
    Wraps the specified stream for gzip decompression. If the gzip header can't
    be read the underlying stream is closed before the exception is rethrown so
    that it doesn't leak.
    */
   private static InputStream wrapGzip(InputStream inStream)
   throws IOException
   {
      try
      {
         return new GZIPInputStream(inStream);
      }
      catch (IOException e)
      {
         try
         {
            inStream.close();
         }
         catch (IOException ignored)
         {
            // The gzip failure is the error worth reporting.
         }

         throw e;
      }
   }

   //--------------------------------------------------------------------------
   /**
    Opens the specified file, decompressing it if its name ends with '.gz'.
    @throws RuntimeException if the file doesn't exist
    */
   private static InputStream getFileStream(File inFile)
   throws IOException
   {
      if (!inFile.exists())
      {
         throw new RuntimeException("'" + inFile + "' doesn't exist!");
      }

      InputStream stream = new FileInputStream(inFile);
      if (inFile.getName().endsWith(".gz"))
      {
         stream = wrapGzip(stream);
      }

      return stream;
   }

   //--------------------------------------------------------------------------
   /**
    Opens the specified resource (resolved via this class), decompressing it if
    its name ends with '.gz'.
    @throws RuntimeException if the resource can't be found
    */
   private static InputStream getResourceStream(String inResource)
   throws IOException
   {
      InputStream stream = UniprotSpecies.class.getResourceAsStream(inResource);
      if (null == stream)
      {
         throw new RuntimeException("'" + inResource + "' couldn't be found!");
      }

      if (inResource.endsWith(".gz"))
      {
         stream = wrapGzip(stream);
      }

      return stream;
   }

   //--------------------------------------------------------------------------
   /**
    Opens a reader on the user-specified species file if one has been set and
    on the bundled species list otherwise.
    */
   private static BufferedReader getSpeciesReader()
   throws IOException
   {
      InputStream stream;

      if (sSpeciesFile != null)
      {
         stream = getFileStream(sSpeciesFile);
      }
      else
      {
         stream = getResourceStream(SPECIES_FILE);
      }

      // NOTE(review): decodes with the platform default charset, as before.
      // Confirm the encoding of speclist.txt before pinning an explicit charset.
      return new BufferedReader(new InputStreamReader(stream));
   }

   //--------------------------------------------------------------------------
   /**
    Applies a continuation line ('C=' common name or 'S=' synonym) to the entry
    it follows. Lines matching neither form are ignored.
    */
   private static void applyDetailLine(UniprotSpecies inEntry, String inLine)
   {
      Matcher m = C_LINE_PATTERN.matcher(inLine);
      if (m.matches())
      {
         inEntry.setCommonName(m.group(1));
         return;
      }

      m = S_LINE_PATTERN.matcher(inLine);
      if (m.matches())
      {
         inEntry.setSynonym(m.group(1));
      }
   }

   //--------------------------------------------------------------------------
   /**
    Parses the species list into a new code index.
    @return a map of species code to species
    @throws RuntimeException if the species list can't be found or read
    */
   private static Map<String, UniprotSpecies> parseSpeciesFile()
   {
      Map<String, UniprotSpecies> index = new HashMap<String, UniprotSpecies>(15000);

      UniprotSpecies entry = null;
      int lineCount = 0;

      try
      {
         BufferedReader fileReader = null;

         try
         {
            fileReader = getSpeciesReader();

            boolean inHeader = true;
            String line;
            while ((line = fileReader.readLine()) != null)
            {
               lineCount++;

               if (inHeader)
               {
                  if (line.startsWith("_____ "))
                  {
                     // The column ruler marks the end of the header.
                     inHeader = false;
                     continue;
                  }

                  // Other header lines deliberately fall through to the entry
                  // matching below, so a list without the ruler still parses.
               }
               else if (line.startsWith("--------"))
               {
                  // Hit the copyright at the end.
                  break;
               }

               Matcher m = N_LINE_PATTERN.matcher(line);
               if (m.matches())
               {
                  entry = new UniprotSpecies(m.group(1));
                  index.put(m.group(1), entry);

                  entry.setKingdomCode(m.group(2).charAt(0));
                  if (!m.group(3).startsWith("?"))
                  {
                     // A taxon id given as '?' is left null.
                     entry.setTaxonId(Integer.parseInt(m.group(3)));
                  }
                  entry.setScientificName(m.group(4));
               }
               else if (entry != null)
               {
                  applyDetailLine(entry, line);
               }
            }
         }
         finally
         {
            if (fileReader != null) fileReader.close();
         }
      }
      catch (IOException e)
      {
         throw new RuntimeException("Error parsing species file. line: " + lineCount, e);
      }

      return index;
   }

}