001package com.hfg.bio.seq.translation;
002
003import java.io.BufferedReader;
004import java.io.InputStream;
005import java.io.InputStreamReader;
006import java.net.URL;
007import java.net.URLConnection;
008import java.util.*;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011import java.util.zip.GZIPInputStream;
012
013import com.hfg.bio.Nucleotide;
014import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
015import com.hfg.exception.ProgrammingException;
016import com.hfg.math.Range;
017import com.hfg.util.collection.CollectionUtil;
018import com.hfg.util.collection.OrderedSet;
019import com.hfg.util.StringUtil;
020import com.hfg.util.io.StreamUtil;
021import com.hfg.xml.HfgXML;
022import com.hfg.xml.XMLName;
023import com.hfg.xml.XMLTag;
024
025//------------------------------------------------------------------------------
026/**
027 Table that maps codons to their respective amino acids.
028 <div>
029  @author J. Alex Taylor, hairyfatguy.com
030 </div>
031 */
032//------------------------------------------------------------------------------
033// com.hfg Library
034//
035// This library is free software; you can redistribute it and/or
036// modify it under the terms of the GNU Lesser General Public
037// License as published by the Free Software Foundation; either
038// version 2.1 of the License, or (at your option) any later version.
039//
040// This library is distributed in the hope that it will be useful,
041// but WITHOUT ANY WARRANTY; without even the implied warranty of
042// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
043// Lesser General Public License for more details.
044//
045// You should have received a copy of the GNU Lesser General Public
046// License along with this library; if not, write to the Free Software
047// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
048//
049// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
050// jataylor@hairyfatguy.com
051//------------------------------------------------------------------------------
052
053public class CodonTable implements TranslationTable
054{
055
056   // XML Tag names
057   public static final XMLName XML_CODON_TABLE    = new XMLName("CodonTable", HfgXML.HFG_NAMESPACE);
058
059   // XML Attribute names
060   public static final XMLName XML_NAME_ATT       = new XMLName("name",  HfgXML.HFG_NAMESPACE);
061   public static final XMLName XML_SPECIES_ATT    = new XMLName("species",  HfgXML.HFG_NAMESPACE);
062
063
064   public static final CodonTable HUMAN = new CodonTable("codondata/human.cod.xml.gz");
065   public static final CodonTable MOUSE = new CodonTable("codondata/mouse.cod.xml.gz");
066
067
068   private static final String sKazusaUrlTemplate = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=%d&aa=1&style=N";
069
070   private static final Pattern sKazusaLinePattern = Pattern.compile("\\s*(\\w{3})\\s+([\\w\\*])\\s+([\\.\\d]+)\\s+([\\.\\d]+)\\s+\\(\\s*(\\d+)\\)");
071
072   private static Random sRandom = new Random(System.currentTimeMillis());
073
074   private String    mName;
075   private NCBITaxon mTaxon;
076
077   private Set<Codon>                 mCodons = new OrderedSet<>(65);
078   private Map<Codon, Character>      mCodonToAAMap = new HashMap<>(65);
079   private Map<Character, Set<Codon>> mAAToCodonsMap = new HashMap<>(26);
080
081   //###########################################################################
082   // CONSTRUCTORS
083   //###########################################################################
084
085   //---------------------------------------------------------------------------
086   public CodonTable(NCBITaxon inSpecies)
087   {
088      mTaxon = inSpecies;
089   }
090
091   //---------------------------------------------------------------------------
092   public CodonTable(XMLTag inXMLTag)
093   {
094      fromXMLTag(inXMLTag);
095      init();
096   }
097
098   //---------------------------------------------------------------------------
099   protected CodonTable(String inRsrcPath)
100   {
101      InputStream stream = getClass().getResourceAsStream(inRsrcPath);
102      if (null == stream)
103      {
104         throw new ProgrammingException("The rsrc " + inRsrcPath + " couldn't be found!?");
105      }
106
107      try
108      {
109         if (inRsrcPath.endsWith(".gz"))
110         {
111            stream = new GZIPInputStream(stream);
112         }
113
114         fromXMLTag(new XMLTag(stream));
115
116         StreamUtil.close(stream);
117      }
118      catch (Exception e)
119      {
120         throw new ProgrammingException(e);
121      }
122
123      init();
124   }
125
126   //###########################################################################
127   // PUBLIC METHODS
128   //###########################################################################
129
130   //---------------------------------------------------------------------------
131   public String name()
132   {
133      return mName;
134   }
135
136   //---------------------------------------------------------------------------
137   private CodonTable setName(String inValue)
138   {
139      mName = inValue;
140      return this;
141   }
142
143   //---------------------------------------------------------------------------
144   /**
145    Dynamically retrieves codon usage data from the Codon Usage Database
146    (<a href='http://www.kazusa.or.jp/codon/'>http://www.kazusa.or.jp/codon/</a>)
147    @throws Exception if the data cannot be retrieved
148    */
149   public void retrieveDataFromKazusa()
150      throws Exception
151   {
152      URL url = new URL(String.format(sKazusaUrlTemplate, mTaxon.getTaxonId()));
153
154      URLConnection connection = url.openConnection();
155      parseKazusaData(connection.getInputStream());
156   }
157
158   //---------------------------------------------------------------------------
159   public XMLTag toXMLTag()
160   {
161      XMLTag tag = new XMLTag(XML_CODON_TABLE);
162
163      if (name() != null)
164      {
165         tag.setAttribute(XML_NAME_ATT, name());
166      }
167
168      if (mTaxon != null)
169      {
170         tag.setAttribute(XML_SPECIES_ATT, mTaxon.getScientificName());
171      }
172
173      if (mCodons != null)
174      {
175         for (Codon codon : mCodons)
176         {
177            tag.addSubtag(codon.toXMLTag());
178         }
179      }
180
181      return tag;
182   }
183
184   //---------------------------------------------------------------------------
185   public CodonTable addCodon(Codon inValue)
186   {
187      mCodons.add(inValue);
188
189      mCodonToAAMap.put(inValue, inValue.getAA());
190
191      Set<Codon> codons = mAAToCodonsMap.get(inValue.getAA());
192      if (null == codons)
193      {
194         codons = new HashSet<>(10);
195         mAAToCodonsMap.put(inValue.getAA(), codons);
196      }
197
198      codons.add(inValue);
199
200      return this;
201   }
202
203   //---------------------------------------------------------------------------
204   /**
205    Can be used to retrieve the internal Codon object which can contain the AA mapping and codon usage data.
206    @param inCodon the string representation of the Codon object to retrieve
207    @return the Codon object corresponding to the specified nucleotide triplet
208    */
209   public Codon getCodon(String inCodon)
210   {
211      Codon queryCodon = new Codon(inCodon);
212      Codon requestedCodon = null;
213      for (Codon codon : mCodons)
214      {
215         if (codon.equals(queryCodon))
216         {
217            requestedCodon = codon;
218            break;
219         }
220      }
221
222      return requestedCodon;
223   }
224
225   //--------------------------------------------------------------------------
226   public Set<Codon> getCodons()
227   {
228      return Collections.unmodifiableSet(mCodons);
229   }
230
231   //---------------------------------------------------------------------------
232   public Set<Codon> getCodonsForAA(char inAA)
233   {
234      return mAAToCodonsMap.get(inAA);
235   }
236
237   //---------------------------------------------------------------------------
238   /**
239    Returns a weighted randomly selected codon for the specified amino acid.
240    @param inAA the amino acid character for which codons should be selected
241    @return the randomly selected Codon
242    */
243   public Codon getCodonForAA_viaWeightedSelection(char inAA)
244   {
245      return getCodonForAA_viaWeightedSelection(inAA, 0);
246   }
247
248   //---------------------------------------------------------------------------
249   /**
250    Returns a weighted randomly selected codon meeting a specified minimum usage bias
251    for the specified amino acid.
252    @param inAA the amino acid character for which codons should be selected
253    @param inMinBias the minimum usage bias (frequency) for codons to be considered
254    @return the randomly selected Codon
255    */
256   public Codon getCodonForAA_viaWeightedSelection(char inAA, float inMinBias)
257   {
258      // Force to uppercase
259      char aa = Character.toUpperCase(inAA);
260
261      float totalWeight = 0.0f;
262
263      Set<Codon> codons = mAAToCodonsMap.get(aa);
264      if (! CollectionUtil.hasValues(codons))
265      {
266         throw new RuntimeException("No codons specified for amino acid " + StringUtil.singleQuote(inAA) + "!");
267      }
268
269      for (Codon codon : codons)
270      {
271         if (codon.getCodonUsage().getBias() >= inMinBias)
272         {
273            totalWeight += codon.getCodonUsage().getBias();
274         }
275      }
276
277      float randomNum = sRandom.nextFloat();
278
279      Codon selectedCodon = null;
280
281      Range<Float> valueRange = new Range<Float>().setStart(0f);
282      for (Codon codon : codons)
283      {
284         if (codon.getCodonUsage().getBias() >= inMinBias)
285         {
286            valueRange.setEnd(valueRange.getStart() + (codon.getCodonUsage().getBias() / totalWeight));
287
288            if (valueRange.contains(randomNum))
289            {
290               selectedCodon = codon;
291               break;
292            }
293
294            valueRange.setStart(valueRange.getEnd());
295         }
296      }
297
298      return selectedCodon;
299   }
300
301   //---------------------------------------------------------------------------
302   public char translateCodon(String inCodon)
303   {
304      return translateCodon(new Codon(inCodon));
305   }
306
307   //---------------------------------------------------------------------------
308   public char translateCodon(Codon inCodon)
309   {
310      Character aa = mCodonToAAMap.get(inCodon);
311      return (aa != null ? aa : 'X');
312   }
313
314   //---------------------------------------------------------------------------
315   public boolean containsCodonUsageData()
316   {
317      boolean result = false;
318      if (mAAToCodonsMap.size() > 0)
319      {
320         result = (mAAToCodonsMap.values().iterator().next().iterator().next().getCodonUsage() != null);
321      }
322      
323      return result;
324   }
325
326   //###########################################################################
327   // PRIVATE METHODS
328   //###########################################################################
329
330   //---------------------------------------------------------------------------
331   // Calculate degenerate codons
332   private void init()
333   {
334      for (Character aa : mAAToCodonsMap.keySet())
335      {
336         Set<Codon> codonSet = mAAToCodonsMap.get(aa);
337
338         Map<Nucleotide, Map<Nucleotide, Set<Nucleotide>>> codonNucleotideMap = new HashMap<>(2);
339
340         for (Codon codon : codonSet)
341         {
342            Nucleotide pos1Nuc = Nucleotide.valueOf(codon.toString().charAt(0));
343            Nucleotide pos2Nuc = Nucleotide.valueOf(codon.toString().charAt(1));
344            Nucleotide pos3Nuc = Nucleotide.valueOf(codon.toString().charAt(2));
345
346            Map<Nucleotide, Set<Nucleotide>> pos2Map = codonNucleotideMap.get(pos1Nuc);
347            if (null == pos2Map)
348            {
349               pos2Map = new HashMap<>(3);
350               codonNucleotideMap.put(pos1Nuc, pos2Map);
351            }
352
353            Set<Nucleotide> pos3Set = pos2Map.get(pos2Nuc);
354            if (null == pos3Set)
355            {
356               pos3Set = new HashSet<>(4);
357               pos2Map.put(pos2Nuc, pos3Set);
358            }
359
360            pos3Set.add(pos3Nuc);
361         }
362
363         for (Nucleotide pos1Nuc : codonNucleotideMap.keySet())
364         {
365            Map<Nucleotide, Set<Nucleotide>> pos2Map = codonNucleotideMap.get(pos1Nuc);
366            for (Nucleotide pos2Nuc : pos2Map.keySet())
367            {
368               Set<Nucleotide> pos3Set = pos2Map.get(pos2Nuc);
369
370               for (Nucleotide degenerateNuc : Nucleotide.degenerateValues())
371               {
372                  boolean matches = true;
373                  for (Nucleotide base : degenerateNuc.getDegeneracy())
374                  {
375                     if (!pos3Set.contains(base))
376                     {
377                        matches = false;
378                        break;
379                     }
380                  }
381
382                  if (matches)
383                  {
384                     Codon degenerateCodon = new Codon(("" + pos1Nuc.getOneLetterCode() + pos2Nuc.getOneLetterCode() + degenerateNuc.getOneLetterCode()).toUpperCase());
385
386                     // Not calling addCodon() because we don't want to add degenerate codons to mAAToCodonMap.
387                     mCodons.add(degenerateCodon);
388
389                     mCodonToAAMap.put(degenerateCodon, aa);
390                  }
391               }
392            }
393         }
394      }
395   }
396
397   //---------------------------------------------------------------------------
398   private void fromXMLTag(XMLTag inXMLTag)
399   {
400      inXMLTag.verifyTagName(XML_CODON_TABLE);
401
402      String name = inXMLTag.getAttributeValue(XML_NAME_ATT);
403      if (StringUtil.isSet(name))
404      {
405         setName(name);
406      }
407
408      String species = inXMLTag.getAttributeValue(XML_SPECIES_ATT);
409      if (StringUtil.isSet(species))
410      {
411         // TODO: Use the taxon id instead?
412         Set<NCBITaxon> taxons = NCBITaxon.getByName(species);
413         if (CollectionUtil.hasValues(taxons))
414         {
415            mTaxon = taxons.iterator().next();
416         }
417      }
418
419      List<XMLTag> codonTags = inXMLTag.getSubtagsByName(Codon.XML_CODON);
420      if (CollectionUtil.hasValues(codonTags))
421      {
422         for (XMLTag codonTag : codonTags)
423         {
424            addCodon(new Codon(codonTag));
425         }
426      }
427
428      if (containsCodonUsageData())
429      {
430         // Is bias data present?
431         CodonUsage codonUsage = mCodonToAAMap.keySet().iterator().next().getCodonUsage();
432         if (null == codonUsage.getBias())
433         {
434            calculateBiasValues();
435         }
436      }
437   }
438
439   //---------------------------------------------------------------------------
440   private void clearCodonData()
441   {
442      mCodons.clear();
443      mAAToCodonsMap.clear();
444      mCodonToAAMap.clear();
445   }
446
447   //---------------------------------------------------------------------------
448   private void parseKazusaData(InputStream inContent)
449         throws Exception
450   {
451      clearCodonData();
452
453      BufferedReader reader = new BufferedReader(new InputStreamReader(inContent));
454
455      String line;
456      while ((line = reader.readLine()) != null)
457      {
458         line = line.trim();
459         if (StringUtil.isSet(line))
460         {
461            Matcher m = sKazusaLinePattern.matcher(line);
462            int offset = 0;
463            while (m.find(offset))
464            {
465               Codon codon = new Codon(m.group(1)).setAA(m.group(2).charAt(0));
466               CodonUsage usage = new CodonUsage();
467               usage.setBias(Float.parseFloat(m.group(3)));
468               usage.setFreqPer1000(Float.parseFloat(m.group(4)));
469               usage.setNumber(Integer.parseInt(m.group(5)));
470
471               codon.setCodonUsage(usage);
472
473               addCodon(codon);
474               if (m.end() + 1 < line.length())
475               {
476                  offset = m.end() + 1;
477               }
478               else
479               {
480                  break;
481               }
482            }
483         }
484      }
485
486      reader.close();
487
488      if (mCodons.size() != 64)
489      {
490         throw new RuntimeException("Problem retrieving codon data. " + mCodons.size() + " codons were parsed instead of 64!");
491      }
492   }
493
494   //---------------------------------------------------------------------------
495   private void calculateBiasValues()
496   {
497      for (Character aa : mAAToCodonsMap.keySet())
498      {
499         Set<Codon> codons = mAAToCodonsMap.get(aa);
500         int totalNum = 0;
501         for (Codon codon : codons)
502         {
503            totalNum += codon.getCodonUsage().getNumber();
504         }
505
506         for (Codon codon : codons)
507         {
508            float bias = codon.getCodonUsage().getNumber() / (float) totalNum;
509            codon.getCodonUsage().setBias(bias);
510         }
511      }
512   }
513
514}