001package com.hfg.bio.seq.pattern; 002 003import java.util.ArrayList; 004import java.util.Arrays; 005import java.util.Collection; 006import java.util.List; 007import java.util.regex.Matcher; 008import java.util.regex.Pattern; 009 010import com.hfg.bio.Nucleotide; 011import com.hfg.bio.Strand; 012import com.hfg.bio.seq.BioSequenceType; 013import com.hfg.bio.seq.NucleicAcid; 014import com.hfg.bio.seq.SeqLocation; 015import com.hfg.util.StringBuilderPlus; 016import com.hfg.util.StringUtil; 017 018//------------------------------------------------------------------------------ 019/** 020 Container for a nucleotide pattern (motif). 021 022 <div> 023 @author J. Alex Taylor, hairyfatguy.com 024 </div> 025 */ 026//------------------------------------------------------------------------------ 027// com.hfg Library 028// 029// This library is free software; you can redistribute it and/or 030// modify it under the terms of the GNU Lesser General Public 031// License as published by the Free Software Foundation; either 032// version 2.1 of the License, or (at your option) any later version. 033// 034// This library is distributed in the hope that it will be useful, 035// but WITHOUT ANY WARRANTY; without even the implied warranty of 036// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 037// Lesser General Public License for more details. 038// 039// You should have received a copy of the GNU Lesser General Public 040// License along with this library; if not, write to the Free Software 041// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 042// 043// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 044// jataylor@hairyfatguy.com 045//------------------------------------------------------------------------------ 046 047public class NucleotidePattern<T extends NucleotidePatternMatch> extends SeqPattern<NucleicAcid, T> 048{ 049 private List<Strand> mStrandsToSearch = Arrays.asList(Strand.values()); // Default to searching both strands 050 private Pattern mAlternateStrandPattern; 051 052 //########################################################################### 053 // CONSTRUCTORS 054 //########################################################################### 055 056 //-------------------------------------------------------------------------- 057 protected NucleotidePattern() 058 { 059 } 060 061 //-------------------------------------------------------------------------- 062 public NucleotidePattern(String inPatternString) 063 { 064 super(inPatternString.toUpperCase()); 065 } 066 067 //########################################################################### 068 // PUBLIC METHODS 069 //########################################################################### 070 071 //-------------------------------------------------------------------------- 072 public NucleotidePattern clone() 073 { 074 NucleotidePattern cloneObj = (NucleotidePattern) super.clone(); 075 076 if (mStrandsToSearch != null) 077 { 078 cloneObj.mStrandsToSearch = new ArrayList(mStrandsToSearch); 079 } 080 081 return cloneObj; 082 } 083 084 //-------------------------------------------------------------------------- 085 public BioSequenceType getBioSequenceType() 086 { 087 return BioSequenceType.NUCLEIC_ACID; 088 } 089 090 //-------------------------------------------------------------------------- 091 @Override 092 public NucleotidePattern setIgnoreGaps(boolean inValue) 093 { 094 return (NucleotidePattern) super.setIgnoreGaps(inValue); 095 } 096 097 //-------------------------------------------------------------------------- 098 @Override 099 public NucleotidePattern setMaxMismatches(int inValue) 100 { 101 return (NucleotidePattern) super.setMaxMismatches(inValue); 102 } 103 104 //-------------------------------------------------------------------------- 105 public NucleotidePattern setStrandsToSearch(Strand inValue) 106 { 107 mStrandsToSearch = new ArrayList<>(1); 108 mStrandsToSearch.add(inValue); 109 return this; 110 } 111 112 //-------------------------------------------------------------------------- 113 public NucleotidePattern setStrandsToSearch(Strand[] inValues) 114 { 115 mStrandsToSearch = Arrays.asList(inValues); 116 return this; 117 } 118 119 //-------------------------------------------------------------------------- 120 public Collection<Strand> getStrandsToSearch() 121 { 122 return mStrandsToSearch; 123 } 124 125 126 //-------------------------------------------------------------------------- 127 @Override 128 protected T createMatch(String inSeq, SeqLocation inLocation) 129 { 130 return (T) new NucleotidePatternMatch(this, inSeq, inLocation); 131 } 132 133 //########################################################################### 134 // PROTECTED METHODS 135 //########################################################################### 136 137 //-------------------------------------------------------------------------- 138 protected String convertStringToRegExp(String inPrositePattern) 139 { 140 // Remove the period at the end 141 if (inPrositePattern.endsWith(".")) 142 { 143 inPrositePattern = inPrositePattern.substring(0, inPrositePattern.length() - 1); 144 } 145 146 StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getIgnoreGaps() ? "\\-*" : ""); 147 148 // Dashes separate positions 149 String[] positions = inPrositePattern.split("\\-"); 150 for (String position : positions) 151 { 152 boolean nTerm = false; 153 boolean cTerm = false; 154 if (position.startsWith("<")) // 5'? 155 { 156 nTerm = true; 157 position = position.substring(1); 158 } 159 160 if (position.endsWith(">")) // 3'? 161 { 162 cTerm = true; 163 position = position.substring(0, position.length() - 1); 164 } 165 166 // Extract the count spec if present 167 String countSpec = null; 168 Matcher m = PROSITE_COUNT_PATTERN.matcher(position); 169 if (m.find()) 170 { 171 countSpec = "{" + m.group(1) + "}"; 172 position = position.substring(0, m.start(1) - 1); 173 } 174 175 if (position.startsWith("{") 176 && position.endsWith("}")) 177 { 178 StringBuilder positionBuffer = new StringBuilder("[^"); 179 for (int i = 1; i < position.length() - 1; i++) 180 { 181 char theChar = position.charAt(i); 182 183 Nucleotide base = Nucleotide.valueOf(theChar); 184 if (null == base 185 && theChar != '<' 186 && theChar != '>') 187 { 188 throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); 189 } 190 191 positionBuffer.append(theChar); 192 if (base != null 193 && base.isAmbiguous()) 194 { 195 for (Nucleotide baseOption : base.getDegeneracy()) 196 { 197 positionBuffer.append(baseOption.getOneLetterCode()); 198 } 199 } 200 } 201 202 if (getIgnoreGaps()) 203 { 204 positionBuffer.append("\\-"); 205 } 206 207 positionBuffer.append("]"); 208 209 if (getIgnoreGaps() 210 && countSpec != null) 211 { 212 positionBuffer.insert(0, "(?:\\-*"); 213 positionBuffer.append(")"); 214 } 215 216 position = positionBuffer.toString(); 217 } 218 else if (position.startsWith("[") 219 && position.endsWith("]")) 220 { 221 StringBuilder positionBuffer = new StringBuilder("["); 222 for (int i = 1; i < position.length() - 1; i++) 223 { 224 char theChar = position.charAt(i); 225 226 Nucleotide base = Nucleotide.valueOf(theChar); 227 if (null == base 228 && theChar != '<' 229 && theChar != '>') 230 { 231 throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!"); 232 } 233 234 positionBuffer.append(theChar); 235 if (base != null 236 && base.isAmbiguous()) 237 { 238 for (Nucleotide baseOption : base.getDegeneracy()) 239 { 240 positionBuffer.append(baseOption.getOneLetterCode()); 241 } 242 } 243 } 244 245 positionBuffer.append("]"); 246 247 if (getIgnoreGaps() 248 && countSpec != null) 249 { 250 positionBuffer.insert(0, "(?:\\-*"); 251 positionBuffer.append(")"); 252 } 253 254 position = positionBuffer.toString(); 255 } 256 else 257 { 258 if (position.length() > 1) 259 { 260 throw new RuntimeException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(position) + "!"); 261 } 262 263 char residue = position.charAt(0); 264 Nucleotide base = Nucleotide.valueOf(residue); 265 if (null == base 266 && ! position.equals("<") 267 && ! position.equals(">")) 268 { 269 throw new RuntimeException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!"); 270 } 271 272 if (base != null 273 && base.isAmbiguous()) 274 { 275 if (position.equalsIgnoreCase("N")) 276 { 277 if (getIgnoreGaps()) 278 { 279 position = "[^\\-]"; 280 } 281 else 282 { 283 position = "."; 284 } 285 } 286 else 287 { 288 StringBuilder positionBuffer = new StringBuilder("["); 289 positionBuffer.append(residue); 290 291 for (Nucleotide baseOption : base.getDegeneracy()) 292 { 293 positionBuffer.append(baseOption.getOneLetterCode()); 294 } 295 296 positionBuffer.append("]"); 297 298 position = positionBuffer.toString(); 299 } 300 } 301 302 303 if (getIgnoreGaps() 304 && countSpec != null) 305 { 306 position = "(?:\\-*" + position + ")"; 307 } 308 } 309 310 311 if (position.contains("<")) 312 { 313 position = "(?:" + position.replace("<", "") + "|\\A)"; 314 } 315 else if (position.contains(">")) 316 { 317 position = "(?:" + position.replace(">", "") + "|\\Z)"; 318 } 319 320 321 if (getIgnoreGaps() 322 && countSpec != null) 323 { 324 regexp.append((nTerm ? "^" : "") 325 + position 326 + countSpec 327 + (cTerm ? "$" : "")); 328 } 329 else 330 { 331 regexp.delimitedAppend((nTerm ? "^" : "") 332 + position 333 + (countSpec != null ? countSpec : "") 334 + (cTerm ? "$" : "")); 335 } 336 } 337 338 return regexp.toString(); 339 } 340}