package com.hfg.bio.taxonomy.uniprot;

import com.hfg.util.StringUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.Map;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;


//------------------------------------------------------------------------------
/**
 * Species class for Uniprot codes. Uses the speclist.txt provided with Uniprot
 * to map the species codes that are a part of the locus name (Ex: 'HUMAN' in the locus 'TNF_HUMAN').
 * <p>
 * The species list is loaded lazily on the first lookup, either from the file given to
 * {@link #setSpeciesListFile(File)} or, if none was given, from the bundled class resource.
 * </p>
 * <div>
 * @author J. Alex Taylor, hairyfatguy.com
 * </div>
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------

public class UniprotSpecies
{

   //**************************************************************************
   // PRIVATE FIELDS
   //**************************************************************************

   private String  mSpeciesCode;
   private char    mKingdomCode;
   private Integer mTaxonId;
   private String  mScientificName;
   private String  mCommonName;
   private String  mSynonym;


   // Lazily-built index of species code -> species. Null means "not loaded yet";
   // it is only ever assigned a fully-populated map (see getCodeIndex()).
   private static Map<String, UniprotSpecies> sCodeIndex;
   private static File sSpeciesFile;

   private static final String SPECIES_FILE = "rsrc/speclist.txt.gz";

   // Compiled once; Pattern instances are immutable and safe to share.
   private static final Pattern LOCUS_PATTERN  = Pattern.compile("^\\S+_(\\S+)$");
   private static final Pattern N_LINE_PATTERN = Pattern.compile("^(\\S{3,5})\\s+(\\w)\\s+(\\d+|\\?+):\\s+N=(.+)");
   private static final Pattern C_LINE_PATTERN = Pattern.compile("^\\s+C=(.+)");
   private static final Pattern S_LINE_PATTERN = Pattern.compile("^\\s+S=(.+)");

   //**************************************************************************
   // CONSTRUCTORS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    Creates a species entry for the given code. Registration in the code index
    is done by the parser so that a half-finished load never becomes visible.
    @param inCode the Uniprot species code (Ex: 'HUMAN')
    */
   private UniprotSpecies(String inCode)
   {
      mSpeciesCode = inCode;
   }

   //**************************************************************************
   // PUBLIC METHODS
   //**************************************************************************

   //--------------------------------------------------------------------------
   /**
    * Used to load a newer version of the speclist.txt file. The file may be
    * gzip compressed (detected by a '.gz' file name suffix). File found at
    * ftp://www.expasy.ch/databases/uniprot/knowledgebase/docs/speclist.txt
    * <p>
    * The currently loaded index is discarded; the new file is read on the next lookup.
    * </p>
    @param inValue the file to use as the source of Uniprot species data
    */
   public static synchronized void setSpeciesListFile(File inValue)
   {
      sSpeciesFile = inValue;

      // Clear the index so that the new file will be loaded
      sCodeIndex = null;
   }


   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified locus (Ex: 'TNF_HUMAN').
    The species code is taken to be everything after the last underscore and is
    matched case-insensitively, consistent with {@link #getByCode(String)}.
    @param inValue the Uniprot locus name for the species object to return
    @return the species object corresponding to the specified Uniprot locus,
            or null if the species code is not in the species list.
    @throws RuntimeException if the value is null or is not of the form 'NAME_CODE'
    */
   public static UniprotSpecies getByLocus(String inValue)
   {
      // Load first (as before) so that a broken species file is reported ahead of a format problem.
      Map<String, UniprotSpecies> codeIndex = getCodeIndex();

      Matcher m = (inValue != null ? LOCUS_PATTERN.matcher(inValue) : null);
      if (null == m || !m.matches())
      {
         throw new RuntimeException("The locus '" + inValue + "' is not in the proper format!");
      }

      return codeIndex.get(m.group(1).toUpperCase(Locale.ENGLISH));
   }

   //--------------------------------------------------------------------------
   /**
    Retrieves the UniprotSpecies for the specified species code (Ex: 'HUMAN' from the locus 'TNF_HUMAN').
    The lookup is case-insensitive.
    @param inValue the Uniprot code for the species object to return
    @return the species object corresponding to the specified Uniprot code,
            or null if the value is not set or the code is not in the species list.
    */
   public static UniprotSpecies getByCode(String inValue)
   {
      Map<String, UniprotSpecies> codeIndex = getCodeIndex();

      // A fixed locale keeps the upper-casing independent of the JVM's default
      // locale (under a Turkish locale 'i' would otherwise not map to 'I').
      return (StringUtil.isSet(inValue) ? codeIndex.get(inValue.toUpperCase(Locale.ENGLISH)) : null);
   }

   //--------------------------------------------------------------------------
   /**
    @return the Uniprot species code (Ex: 'HUMAN')
    */
   public String getSpeciesCode()
   {
      return mSpeciesCode;
   }

   //--------------------------------------------------------------------------
   /**
    @return the single-character kingdom code from the species list entry
    */
   public char getKingdomCode()
   {
      return mKingdomCode;
   }

   //--------------------------------------------------------------------------
   /**
    Returns the taxon id. The value is equal to the NCBI taxon id.
    @return the taxon id, or null if the species list gives '?' instead of a number
    */
   public Integer getTaxonId()
   {
      return mTaxonId;
   }

   //--------------------------------------------------------------------------
   /**
    @return the scientific name (the 'N=' value of the species list entry)
    */
   public String getScientificName()
   {
      return mScientificName;
   }

   //--------------------------------------------------------------------------
   /**
    @return the common name (the 'C=' value), or null if the entry has none
    */
   public String getCommonName()
   {
      return mCommonName;
   }

   //--------------------------------------------------------------------------
   /**
    @return the synonym (the 'S=' value), or null if the entry has none
    */
   public String getSynonym()
   {
      return mSynonym;
   }


   //**************************************************************************
   // PRIVATE METHODS
   //**************************************************************************


   //--------------------------------------------------------------------------
   /**
    Returns the code index, loading the species list on first use.
    The index is built in a local map and only published once parsing has
    succeeded, so a failed load leaves the class unloaded (and is retried on the
    next lookup) rather than leaving behind an empty or partial index.
    Callers use the returned reference instead of re-reading the static field,
    so a concurrent setSpeciesListFile() cannot null it out from under them.
    */
   private static synchronized Map<String, UniprotSpecies> getCodeIndex()
   {
      if (null == sCodeIndex)
      {
         Map<String, UniprotSpecies> codeIndex = new HashMap<String, UniprotSpecies>(15000);

         parseSpeciesFile(codeIndex);

         sCodeIndex = codeIndex;

         // Kept for backward compatibility: the load has always been announced on stdout.
         System.out.println(sCodeIndex.size() + " Uniprot species codes loaded");
      }

      return sCodeIndex;
   }


   //--------------------------------------------------------------------------
   private void setScientificName(String inValue)
   {
      mScientificName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setCommonName(String inValue)
   {
      mCommonName = inValue;
   }

   //--------------------------------------------------------------------------
   private void setSynonym(String inValue)
   {
      mSynonym = inValue;
   }

   //--------------------------------------------------------------------------
   private void setTaxonId(int inValue)
   {
      mTaxonId = inValue;
   }

   //--------------------------------------------------------------------------
   private void setKingdomCode(char inValue)
   {
      mKingdomCode = inValue;
   }

   //--------------------------------------------------------------------------
   /**
    Wraps the stream in a GZIPInputStream if the name ends with '.gz'.
    If the gzip header cannot be read the underlying stream is closed before the
    exception is rethrown, so the file handle is not leaked.
    */
   private static InputStream gunzipIfNeeded(InputStream inStream, String inName)
      throws IOException
   {
      if (!inName.endsWith(".gz"))
      {
         return inStream;
      }

      try
      {
         return new GZIPInputStream(inStream);
      }
      catch (IOException e)
      {
         try
         {
            inStream.close();
         }
         catch (IOException ignored)
         {
            // The gzip failure is the error worth reporting; a close failure adds nothing.
         }

         throw e;
      }
   }

   //--------------------------------------------------------------------------
   /**
    Opens the specified file, transparently decompressing it if its name ends with '.gz'.
    @throws RuntimeException if the file doesn't exist
    */
   private static InputStream getFileStream(File inFile)
      throws IOException
   {
      if (!inFile.exists())
      {
         throw new RuntimeException("'" + inFile + "' doesn't exist!");
      }

      return gunzipIfNeeded(new FileInputStream(inFile), inFile.getName());
   }

   //--------------------------------------------------------------------------
   /**
    Opens the specified class resource (resolved relative to this class),
    transparently decompressing it if its name ends with '.gz'.
    @throws RuntimeException if the resource can't be found
    */
   private static InputStream getResourceStream(String inResource)
      throws IOException
   {
      InputStream stream = UniprotSpecies.class.getResourceAsStream(inResource);
      if (null == stream)
      {
         throw new RuntimeException("'" + inResource + "' couldn't be found!");
      }

      return gunzipIfNeeded(stream, inResource);
   }

   //--------------------------------------------------------------------------
   /**
    Opens a reader on the user-specified species file if one was set,
    otherwise on the bundled species list resource.
    */
   private static BufferedReader getFileStream()
      throws IOException
   {
      InputStream stream = (sSpeciesFile != null ? getFileStream(sSpeciesFile)
                                                 : getResourceStream(SPECIES_FILE));

      // NOTE(review): decodes with the platform default charset, as before.
      // Confirm the encoding of speclist.txt before pinning an explicit charset.
      return new BufferedReader(new InputStreamReader(stream));
   }

   //--------------------------------------------------------------------------
   /**
    Parses the species list into the specified index.
    An entry starts with a line of the form 'CODE K TAXID: N=Scientific name' and
    may be followed by indented 'C=' (common name) and 'S=' (synonym) lines which
    are attached to the most recent entry.
    @param inCodeIndex the map to populate, keyed by species code
    @throws RuntimeException if the species list can't be found or read
    */
   private static void parseSpeciesFile(Map<String, UniprotSpecies> inCodeIndex)
   {
      UniprotSpecies entry = null;

      int lineCount = 0;

      try
      {
         BufferedReader fileReader = null;

         try
         {
            fileReader = getFileStream();

            boolean inHeader = true;
            String line;
            while ((line = fileReader.readLine()) != null)
            {
               lineCount++;

               if (inHeader)
               {
                  // The header ends at the first column-ruler line. Other header lines
                  // are still offered to the matchers below, so a file that lacks the
                  // ruler line is parsed as well.
                  if (line.startsWith("_____ "))
                  {
                     inHeader = false;
                     continue;
                  }
               }
               else if (line.startsWith("--------"))
               {
                  // Hit the copyright at the end.
                  break;
               }

               Matcher m = N_LINE_PATTERN.matcher(line);
               if (m.matches())
               {
                  entry = new UniprotSpecies(m.group(1));
                  inCodeIndex.put(m.group(1), entry);

                  entry.setKingdomCode(m.group(2).charAt(0));
                  // A taxon id of '?' means unknown; the id is left null.
                  if (!m.group(3).startsWith("?"))
                  {
                     entry.setTaxonId(Integer.parseInt(m.group(3)));
                  }
                  entry.setScientificName(m.group(4));
               }
               else if (entry != null)
               {
                  m = C_LINE_PATTERN.matcher(line);
                  if (m.matches())
                  {
                     entry.setCommonName(m.group(1));
                  }
                  else
                  {
                     m = S_LINE_PATTERN.matcher(line);
                     if (m.matches())
                     {
                        entry.setSynonym(m.group(1));
                     }
                  }
               }
            }
         }
         finally
         {
            if (fileReader != null) fileReader.close();
         }
      }
      catch (IOException e)
      {
         throw new RuntimeException("Error parsing species file. line: " + lineCount, e);
      }
   }

}