001package com.hfg.bio.seq;
002
003
004import java.util.Arrays;
005
006
007//------------------------------------------------------------------------------
008/**
009 * Nucleotide quality trimmer based on Phred's modified-Mott trimming algorithm.
010 *
011 * @author J. Alex Taylor, hairyfatguy.com
012 */
013//------------------------------------------------------------------------------
014// com.hfg XML/HTML Coding Library
015//
016// This library is free software; you can redistribute it and/or
017// modify it under the terms of the GNU Lesser General Public
018// License as published by the Free Software Foundation; either
019// version 2.1 of the License, or (at your option) any later version.
020//
021// This library is distributed in the hope that it will be useful,
022// but WITHOUT ANY WARRANTY; without even the implied warranty of
023// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
024// Lesser General Public License for more details.
025//
026// You should have received a copy of the GNU Lesser General Public
027// License along with this library; if not, write to the Free Software
028// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
029//
030// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
031// jataylor@hairyfatguy.com
032//------------------------------------------------------------------------------
033
034public class NucleicAcidTrimmer
035{
036   public static final String TRIM_RANGE = "trimRange";
037
038   private int   mMinLength = 20;
039   private float mThreshold = 0.05f;
040
041   private double[] mQualityToProbability;
042
043   //###########################################################################
044   // CONSTRUCTORS
045   //###########################################################################
046
047   //--------------------------------------------------------------------------
048   public NucleicAcidTrimmer()
049   {
050   }
051
052   //###########################################################################
053   // PUBLIC METHODS
054   //###########################################################################
055
056   //--------------------------------------------------------------------------
057   public int getMinLength()
058   {
059      return mMinLength;
060   }
061
062   //--------------------------------------------------------------------------
063   public NucleicAcidTrimmer setMinLength(int inValue)
064   {
065      mMinLength = inValue;
066      return this;
067   }
068
069
070   //--------------------------------------------------------------------------
071   public float getProbabilityThreshold()
072   {
073      return mThreshold;
074   }
075
076   //--------------------------------------------------------------------------
077   public NucleicAcidTrimmer setProbabilityThreshold(float inValue)
078   {
079      mThreshold = inValue;
080      return this;
081   }
082
083
084   //--------------------------------------------------------------------------
085   public SeqLocation calculateTrimRange(NucleicAcid inSeq)
086   {
087      SeqLocation trimRange = null;
088
089      // Don't bother trimming if the starting length is less than the min length
090      // or it doesn't have quality scores
091      if (inSeq.length() >= mMinLength
092            && inSeq.getSeqQualityScores() != null)
093      {
094         SeqQualityScores quality = inSeq.getSeqQualityScores();
095         double score = 0;
096         double maxScore = 0;
097         int begin_1based = 0;
098         int end_1based = 0;
099         int tmp = 0;
100         for (int i = 0; i < inSeq.length(); i++)
101         {
102            short qualityScore = quality.getQualityScores()[i];
103
104            score += mThreshold - qualityToProbability(qualityScore);
105
106            if (score > maxScore)
107            {
108               maxScore = score;
109               begin_1based = tmp + 1;
110               end_1based = i + 1;
111            }
112            else if (score < 0)
113            {
114               score = 0;
115               tmp = i + 1;
116            }
117         }
118
119         if (end_1based - begin_1based + 1 >= mMinLength)
120         {
121            trimRange = new SeqLocation(begin_1based, end_1based);
122         }
123      }
124
125      return trimRange;
126   }
127
128   //--------------------------------------------------------------------------
129   public NucleicAcid trim(NucleicAcid inSeq)
130   {
131      NucleicAcid trimmedSeq = null;
132
133      SeqLocation trimRange = calculateTrimRange(inSeq);
134      if (trimRange != null)
135      {
136         trimmedSeq = inSeq.clone()
137               .setSequence(inSeq.getSubSequence(trimRange));
138
139         // Trim the quality score to match the new trimmed seq
140         trimmedSeq.setSeqQualityScores(new SeqQualityScores(Arrays.copyOfRange(trimmedSeq.getSeqQualityScores().getQualityScores(), trimRange.getStart() - 1,
141               trimRange.getEnd()), trimmedSeq.getSeqQualityScores().getScheme()));
142
143
144         trimmedSeq.setAttribute(TRIM_RANGE, trimRange);
145      }
146
147      return trimmedSeq;
148   }
149
150   //###########################################################################
151   // PRIVATE METHODS
152   //###########################################################################
153
154   //--------------------------------------------------------------------------
155   // Quality scores are assumed to be = -10log10(probability)
156   private double qualityToProbability(short inQuality)
157   {
158      if (null == mQualityToProbability)
159      {
160         double[] values = new double[255];
161         for (int i = 0; i < 255; i++)
162         {
163            values[i] = Math.pow(10.0, i / -10.0);
164         }
165
166         mQualityToProbability = values;
167      }
168
169      return mQualityToProbability[inQuality];
170   }
171}