001package com.hfg.bio.seq.translation; 002 003import java.io.BufferedReader; 004import java.io.InputStream; 005import java.io.InputStreamReader; 006import java.net.URL; 007import java.net.URLConnection; 008import java.util.*; 009import java.util.regex.Matcher; 010import java.util.regex.Pattern; 011import java.util.zip.GZIPInputStream; 012 013import com.hfg.bio.Nucleotide; 014import com.hfg.bio.taxonomy.ncbi.NCBITaxon; 015import com.hfg.exception.ProgrammingException; 016import com.hfg.math.Range; 017import com.hfg.util.collection.CollectionUtil; 018import com.hfg.util.collection.OrderedSet; 019import com.hfg.util.StringUtil; 020import com.hfg.util.io.StreamUtil; 021import com.hfg.xml.HfgXML; 022import com.hfg.xml.XMLName; 023import com.hfg.xml.XMLTag; 024 025//------------------------------------------------------------------------------ 026/** 027 Table that maps codons to their respective amino acids. 028 <div> 029 @author J. Alex Taylor, hairyfatguy.com 030 </div> 031 */ 032//------------------------------------------------------------------------------ 033// com.hfg Library 034// 035// This library is free software; you can redistribute it and/or 036// modify it under the terms of the GNU Lesser General Public 037// License as published by the Free Software Foundation; either 038// version 2.1 of the License, or (at your option) any later version. 039// 040// This library is distributed in the hope that it will be useful, 041// but WITHOUT ANY WARRANTY; without even the implied warranty of 042// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 043// Lesser General Public License for more details. 044// 045// You should have received a copy of the GNU Lesser General Public 046// License along with this library; if not, write to the Free Software 047// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 048// 049// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 050// jataylor@hairyfatguy.com 051//------------------------------------------------------------------------------ 052 053public class CodonTable implements TranslationTable 054{ 055 056 // XML Tag names 057 public static final XMLName XML_CODON_TABLE = new XMLName("CodonTable", HfgXML.HFG_NAMESPACE); 058 059 // XML Attribute names 060 public static final XMLName XML_NAME_ATT = new XMLName("name", HfgXML.HFG_NAMESPACE); 061 public static final XMLName XML_SPECIES_ATT = new XMLName("species", HfgXML.HFG_NAMESPACE); 062 063 064 public static final CodonTable HUMAN = new CodonTable("codondata/human.cod.xml.gz"); 065 public static final CodonTable MOUSE = new CodonTable("codondata/mouse.cod.xml.gz"); 066 067 068 private static final String sKazusaUrlTemplate = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=%d&aa=1&style=N"; 069 070 private static final Pattern sKazusaLinePattern = Pattern.compile("\\s*(\\w{3})\\s+([\\w\\*])\\s+([\\.\\d]+)\\s+([\\.\\d]+)\\s+\\(\\s*(\\d+)\\)"); 071 072 private static Random sRandom = new Random(System.currentTimeMillis()); 073 074 private String mName; 075 private NCBITaxon mTaxon; 076 077 private Set<Codon> mCodons = new OrderedSet<>(65); 078 private Map<Codon, Character> mCodonToAAMap = new HashMap<>(65); 079 private Map<Character, Set<Codon>> mAAToCodonsMap = new HashMap<>(26); 080 081 //########################################################################### 082 // CONSTRUCTORS 083 //########################################################################### 084 085 //--------------------------------------------------------------------------- 086 public CodonTable(NCBITaxon inSpecies) 087 { 088 mTaxon = inSpecies; 089 } 090 091 //--------------------------------------------------------------------------- 092 public CodonTable(XMLTag inXMLTag) 093 { 094 fromXMLTag(inXMLTag); 095 init(); 096 } 097 098 //--------------------------------------------------------------------------- 099 protected CodonTable(String inRsrcPath) 100 { 101 InputStream stream = getClass().getResourceAsStream(inRsrcPath); 102 if (null == stream) 103 { 104 throw new ProgrammingException("The rsrc " + inRsrcPath + " couldn't be found!?"); 105 } 106 107 try 108 { 109 if (inRsrcPath.endsWith(".gz")) 110 { 111 stream = new GZIPInputStream(stream); 112 } 113 114 fromXMLTag(new XMLTag(stream)); 115 116 StreamUtil.close(stream); 117 } 118 catch (Exception e) 119 { 120 throw new ProgrammingException(e); 121 } 122 123 init(); 124 } 125 126 //########################################################################### 127 // PUBLIC METHODS 128 //########################################################################### 129 130 //--------------------------------------------------------------------------- 131 public String name() 132 { 133 return mName; 134 } 135 136 //--------------------------------------------------------------------------- 137 private CodonTable setName(String inValue) 138 { 139 mName = inValue; 140 return this; 141 } 142 143 //--------------------------------------------------------------------------- 144 /** 145 Dynamically retrieves codon usage data from the Codon Usage Database 146 (<a href='http://www.kazusa.or.jp/codon/'>http://www.kazusa.or.jp/codon/</a>) 147 @throws Exception if the data cannot be retrieved 148 */ 149 public void retrieveDataFromKazusa() 150 throws Exception 151 { 152 URL url = new URL(String.format(sKazusaUrlTemplate, mTaxon.getTaxonId())); 153 154 URLConnection connection = url.openConnection(); 155 parseKazusaData(connection.getInputStream()); 156 } 157 158 //--------------------------------------------------------------------------- 159 public XMLTag toXMLTag() 160 { 161 XMLTag tag = new XMLTag(XML_CODON_TABLE); 162 163 if (name() != null) 164 { 165 tag.setAttribute(XML_NAME_ATT, name()); 166 } 167 168 if (mTaxon != null) 169 { 170 tag.setAttribute(XML_SPECIES_ATT, mTaxon.getScientificName()); 171 } 172 173 if (mCodons != null) 174 { 175 for (Codon codon : mCodons) 176 { 177 tag.addSubtag(codon.toXMLTag()); 178 } 179 } 180 181 return tag; 182 } 183 184 //--------------------------------------------------------------------------- 185 public CodonTable addCodon(Codon inValue) 186 { 187 mCodons.add(inValue); 188 189 mCodonToAAMap.put(inValue, inValue.getAA()); 190 191 Set<Codon> codons = mAAToCodonsMap.get(inValue.getAA()); 192 if (null == codons) 193 { 194 codons = new HashSet<>(10); 195 mAAToCodonsMap.put(inValue.getAA(), codons); 196 } 197 198 codons.add(inValue); 199 200 return this; 201 } 202 203 //--------------------------------------------------------------------------- 204 /** 205 Can be used to retrieve the internal Codon object which can contain the AA mapping and codon usage data. 206 @param inCodon the string representation of the Codon object to retrieve 207 @return the Codon object corresponding to the specified nucleotide triplet 208 */ 209 public Codon getCodon(String inCodon) 210 { 211 Codon queryCodon = new Codon(inCodon); 212 Codon requestedCodon = null; 213 for (Codon codon : mCodons) 214 { 215 if (codon.equals(queryCodon)) 216 { 217 requestedCodon = codon; 218 break; 219 } 220 } 221 222 return requestedCodon; 223 } 224 225 //-------------------------------------------------------------------------- 226 public Set<Codon> getCodons() 227 { 228 return Collections.unmodifiableSet(mCodons); 229 } 230 231 //--------------------------------------------------------------------------- 232 public Set<Codon> getCodonsForAA(char inAA) 233 { 234 return mAAToCodonsMap.get(inAA); 235 } 236 237 //--------------------------------------------------------------------------- 238 /** 239 Returns a weighted randomly selected codon for the specified amino acid. 240 @param inAA the amino acid character for which codons should be selected 241 @return the randomly selected Codon 242 */ 243 public Codon getCodonForAA_viaWeightedSelection(char inAA) 244 { 245 return getCodonForAA_viaWeightedSelection(inAA, 0); 246 } 247 248 //--------------------------------------------------------------------------- 249 /** 250 Returns a weighted randomly selected codon meeting a specified minimum usage bias 251 for the specified amino acid. 252 @param inAA the amino acid character for which codons should be selected 253 @param inMinBias the minimum usage bias (frequency) for codons to be considered 254 @return the randomly selected Codon 255 */ 256 public Codon getCodonForAA_viaWeightedSelection(char inAA, float inMinBias) 257 { 258 // Force to uppercase 259 char aa = Character.toUpperCase(inAA); 260 261 float totalWeight = 0.0f; 262 263 Set<Codon> codons = mAAToCodonsMap.get(aa); 264 if (! CollectionUtil.hasValues(codons)) 265 { 266 throw new RuntimeException("No codons specified for amino acid " + StringUtil.singleQuote(inAA) + "!"); 267 } 268 269 for (Codon codon : codons) 270 { 271 if (codon.getCodonUsage().getBias() >= inMinBias) 272 { 273 totalWeight += codon.getCodonUsage().getBias(); 274 } 275 } 276 277 float randomNum = sRandom.nextFloat(); 278 279 Codon selectedCodon = null; 280 281 Range<Float> valueRange = new Range<Float>().setStart(0f); 282 for (Codon codon : codons) 283 { 284 if (codon.getCodonUsage().getBias() >= inMinBias) 285 { 286 valueRange.setEnd(valueRange.getStart() + (codon.getCodonUsage().getBias() / totalWeight)); 287 288 if (valueRange.contains(randomNum)) 289 { 290 selectedCodon = codon; 291 break; 292 } 293 294 valueRange.setStart(valueRange.getEnd()); 295 } 296 } 297 298 return selectedCodon; 299 } 300 301 //--------------------------------------------------------------------------- 302 public char translateCodon(String inCodon) 303 { 304 return translateCodon(new Codon(inCodon)); 305 } 306 307 //--------------------------------------------------------------------------- 308 public char translateCodon(Codon inCodon) 309 { 310 Character aa = mCodonToAAMap.get(inCodon); 311 return (aa != null ? aa : 'X'); 312 } 313 314 //--------------------------------------------------------------------------- 315 public boolean containsCodonUsageData() 316 { 317 boolean result = false; 318 if (mAAToCodonsMap.size() > 0) 319 { 320 result = (mAAToCodonsMap.values().iterator().next().iterator().next().getCodonUsage() != null); 321 } 322 323 return result; 324 } 325 326 //########################################################################### 327 // PRIVATE METHODS 328 //########################################################################### 329 330 //--------------------------------------------------------------------------- 331 // Calculate degenerate codons 332 private void init() 333 { 334 for (Character aa : mAAToCodonsMap.keySet()) 335 { 336 Set<Codon> codonSet = mAAToCodonsMap.get(aa); 337 338 Map<Nucleotide, Map<Nucleotide, Set<Nucleotide>>> codonNucleotideMap = new HashMap<>(2); 339 340 for (Codon codon : codonSet) 341 { 342 Nucleotide pos1Nuc = Nucleotide.valueOf(codon.toString().charAt(0)); 343 Nucleotide pos2Nuc = Nucleotide.valueOf(codon.toString().charAt(1)); 344 Nucleotide pos3Nuc = Nucleotide.valueOf(codon.toString().charAt(2)); 345 346 Map<Nucleotide, Set<Nucleotide>> pos2Map = codonNucleotideMap.get(pos1Nuc); 347 if (null == pos2Map) 348 { 349 pos2Map = new HashMap<>(3); 350 codonNucleotideMap.put(pos1Nuc, pos2Map); 351 } 352 353 Set<Nucleotide> pos3Set = pos2Map.get(pos2Nuc); 354 if (null == pos3Set) 355 { 356 pos3Set = new HashSet<>(4); 357 pos2Map.put(pos2Nuc, pos3Set); 358 } 359 360 pos3Set.add(pos3Nuc); 361 } 362 363 for (Nucleotide pos1Nuc : codonNucleotideMap.keySet()) 364 { 365 Map<Nucleotide, Set<Nucleotide>> pos2Map = codonNucleotideMap.get(pos1Nuc); 366 for (Nucleotide pos2Nuc : pos2Map.keySet()) 367 { 368 Set<Nucleotide> pos3Set = pos2Map.get(pos2Nuc); 369 370 for (Nucleotide degenerateNuc : Nucleotide.degenerateValues()) 371 { 372 boolean matches = true; 373 for (Nucleotide base : degenerateNuc.getDegeneracy()) 374 { 375 if (!pos3Set.contains(base)) 376 { 377 matches = false; 378 break; 379 } 380 } 381 382 if (matches) 383 { 384 Codon degenerateCodon = new Codon(("" + pos1Nuc.getOneLetterCode() + pos2Nuc.getOneLetterCode() + degenerateNuc.getOneLetterCode()).toUpperCase()); 385 386 // Not calling addCodon() because we don't want to add degenerate codons to mAAToCodonMap. 387 mCodons.add(degenerateCodon); 388 389 mCodonToAAMap.put(degenerateCodon, aa); 390 } 391 } 392 } 393 } 394 } 395 } 396 397 //--------------------------------------------------------------------------- 398 private void fromXMLTag(XMLTag inXMLTag) 399 { 400 inXMLTag.verifyTagName(XML_CODON_TABLE); 401 402 String name = inXMLTag.getAttributeValue(XML_NAME_ATT); 403 if (StringUtil.isSet(name)) 404 { 405 setName(name); 406 } 407 408 String species = inXMLTag.getAttributeValue(XML_SPECIES_ATT); 409 if (StringUtil.isSet(species)) 410 { 411 // TODO: Use the taxon id instead? 412 Set<NCBITaxon> taxons = NCBITaxon.getByName(species); 413 if (CollectionUtil.hasValues(taxons)) 414 { 415 mTaxon = taxons.iterator().next(); 416 } 417 } 418 419 List<XMLTag> codonTags = inXMLTag.getSubtagsByName(Codon.XML_CODON); 420 if (CollectionUtil.hasValues(codonTags)) 421 { 422 for (XMLTag codonTag : codonTags) 423 { 424 addCodon(new Codon(codonTag)); 425 } 426 } 427 428 if (containsCodonUsageData()) 429 { 430 // Is bias data present? 431 CodonUsage codonUsage = mCodonToAAMap.keySet().iterator().next().getCodonUsage(); 432 if (null == codonUsage.getBias()) 433 { 434 calculateBiasValues(); 435 } 436 } 437 } 438 439 //--------------------------------------------------------------------------- 440 private void clearCodonData() 441 { 442 mCodons.clear(); 443 mAAToCodonsMap.clear(); 444 mCodonToAAMap.clear(); 445 } 446 447 //--------------------------------------------------------------------------- 448 private void parseKazusaData(InputStream inContent) 449 throws Exception 450 { 451 clearCodonData(); 452 453 BufferedReader reader = new BufferedReader(new InputStreamReader(inContent)); 454 455 String line; 456 while ((line = reader.readLine()) != null) 457 { 458 line = line.trim(); 459 if (StringUtil.isSet(line)) 460 { 461 Matcher m = sKazusaLinePattern.matcher(line); 462 int offset = 0; 463 while (m.find(offset)) 464 { 465 Codon codon = new Codon(m.group(1)).setAA(m.group(2).charAt(0)); 466 CodonUsage usage = new CodonUsage(); 467 usage.setBias(Float.parseFloat(m.group(3))); 468 usage.setFreqPer1000(Float.parseFloat(m.group(4))); 469 usage.setNumber(Integer.parseInt(m.group(5))); 470 471 codon.setCodonUsage(usage); 472 473 addCodon(codon); 474 if (m.end() + 1 < line.length()) 475 { 476 offset = m.end() + 1; 477 } 478 else 479 { 480 break; 481 } 482 } 483 } 484 } 485 486 reader.close(); 487 488 if (mCodons.size() != 64) 489 { 490 throw new RuntimeException("Problem retrieving codon data. " + mCodons.size() + " codons were parsed instead of 64!"); 491 } 492 } 493 494 //--------------------------------------------------------------------------- 495 private void calculateBiasValues() 496 { 497 for (Character aa : mAAToCodonsMap.keySet()) 498 { 499 Set<Codon> codons = mAAToCodonsMap.get(aa); 500 int totalNum = 0; 501 for (Codon codon : codons) 502 { 503 totalNum += codon.getCodonUsage().getNumber(); 504 } 505 506 for (Codon codon : codons) 507 { 508 float bias = codon.getCodonUsage().getNumber() / (float) totalNum; 509 codon.getCodonUsage().setBias(bias); 510 } 511 } 512 } 513 514}