001package com.hfg.bio.seq; 002 003 004import java.util.Arrays; 005 006 007//------------------------------------------------------------------------------ 008/** 009 * Nucleotide quality trimmer based on Phred's modified-Mott trimming algorithm. 010 * 011 * @author J. Alex Taylor, hairyfatguy.com 012 */ 013//------------------------------------------------------------------------------ 014// com.hfg XML/HTML Coding Library 015// 016// This library is free software; you can redistribute it and/or 017// modify it under the terms of the GNU Lesser General Public 018// License as published by the Free Software Foundation; either 019// version 2.1 of the License, or (at your option) any later version. 020// 021// This library is distributed in the hope that it will be useful, 022// but WITHOUT ANY WARRANTY; without even the implied warranty of 023// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 024// Lesser General Public License for more details. 025// 026// You should have received a copy of the GNU Lesser General Public 027// License along with this library; if not, write to the Free Software 028// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 029// 030// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 031// jataylor@hairyfatguy.com 032//------------------------------------------------------------------------------ 033 034public class NucleicAcidTrimmer 035{ 036 public static final String TRIM_RANGE = "trimRange"; 037 038 private int mMinLength = 20; 039 private float mThreshold = 0.05f; 040 041 private double[] mQualityToProbability; 042 043 //########################################################################### 044 // CONSTRUCTORS 045 //########################################################################### 046 047 //-------------------------------------------------------------------------- 048 public NucleicAcidTrimmer() 049 { 050 } 051 052 //########################################################################### 053 // PUBLIC METHODS 054 //########################################################################### 055 056 //-------------------------------------------------------------------------- 057 public int getMinLength() 058 { 059 return mMinLength; 060 } 061 062 //-------------------------------------------------------------------------- 063 public NucleicAcidTrimmer setMinLength(int inValue) 064 { 065 mMinLength = inValue; 066 return this; 067 } 068 069 070 //-------------------------------------------------------------------------- 071 public float getProbabilityThreshold() 072 { 073 return mThreshold; 074 } 075 076 //-------------------------------------------------------------------------- 077 public NucleicAcidTrimmer setProbabilityThreshold(float inValue) 078 { 079 mThreshold = inValue; 080 return this; 081 } 082 083 084 //-------------------------------------------------------------------------- 085 public SeqLocation calculateTrimRange(NucleicAcid inSeq) 086 { 087 SeqLocation trimRange = null; 088 089 // Don't bother trimming if the starting length is less than the min length 090 // or it doesn't have quality scores 091 if (inSeq.length() >= mMinLength 092 && inSeq.getSeqQualityScores() != null) 093 { 094 SeqQualityScores quality = inSeq.getSeqQualityScores(); 095 double score = 0; 096 double maxScore = 0; 097 int begin_1based = 0; 098 int end_1based = 0; 099 int tmp = 0; 100 for (int i = 0; i < inSeq.length(); i++) 101 { 102 short qualityScore = quality.getQualityScores()[i]; 103 104 score += mThreshold - qualityToProbability(qualityScore); 105 106 if (score > maxScore) 107 { 108 maxScore = score; 109 begin_1based = tmp + 1; 110 end_1based = i + 1; 111 } 112 else if (score < 0) 113 { 114 score = 0; 115 tmp = i + 1; 116 } 117 } 118 119 if (end_1based - begin_1based + 1 >= mMinLength) 120 { 121 trimRange = new SeqLocation(begin_1based, end_1based); 122 } 123 } 124 125 return trimRange; 126 } 127 128 //-------------------------------------------------------------------------- 129 public NucleicAcid trim(NucleicAcid inSeq) 130 { 131 NucleicAcid trimmedSeq = null; 132 133 SeqLocation trimRange = calculateTrimRange(inSeq); 134 if (trimRange != null) 135 { 136 trimmedSeq = inSeq.clone() 137 .setSequence(inSeq.getSubSequence(trimRange)); 138 139 // Trim the quality score to match the new trimmed seq 140 trimmedSeq.setSeqQualityScores(new SeqQualityScores(Arrays.copyOfRange(trimmedSeq.getSeqQualityScores().getQualityScores(), trimRange.getStart() - 1, 141 trimRange.getEnd()), trimmedSeq.getSeqQualityScores().getScheme())); 142 143 144 trimmedSeq.setAttribute(TRIM_RANGE, trimRange); 145 } 146 147 return trimmedSeq; 148 } 149 150 //########################################################################### 151 // PRIVATE METHODS 152 //########################################################################### 153 154 //-------------------------------------------------------------------------- 155 // Quality scores are assumed to be = -10log10(probability) 156 private double qualityToProbability(short inQuality) 157 { 158 if (null == mQualityToProbability) 159 { 160 double[] values = new double[255]; 161 for (int i = 0; i < 255; i++) 162 { 163 values[i] = Math.pow(10.0, i / -10.0); 164 } 165 166 mQualityToProbability = values; 167 } 168 169 return mQualityToProbability[inQuality]; 170 } 171}