001package com.hfg.bio.seq.pattern; 002 003import java.util.ArrayList; 004import java.util.List; 005import java.util.regex.Matcher; 006import java.util.regex.Pattern; 007 008import com.hfg.bio.Nucleotide; 009import com.hfg.bio.seq.BioSequence; 010import com.hfg.bio.seq.BioSequenceType; 011import com.hfg.bio.seq.SeqLocation; 012import com.hfg.exception.ProgrammingException; 013import com.hfg.math.Range; 014import com.hfg.util.StringUtil; 015import com.hfg.util.collection.CollectionUtil; 016 017//------------------------------------------------------------------------------ 018/** 019 Abstract container for a sequence pattern (motif). 020 <p> 021 The pattern syntax is based on PROSITE with one addition - the symbol '!' can be added to end of a position 022 specification to indicate that mismatches are not allowed at that position. 023 </p> 024 <p> 025 From the <a href='http://prosite.expasy.org/prosuser.html#conv_pa'>PROSITE user manual</a>: 026 </p> 027 <p> 028 The patterns are described using the following conventions: 029 <ul> 030 <li>The standard IUPAC one-letter codes for the amino acids are used.</li> 031 <li>The symbol 'x' is used for a position where any amino acid is accepted.</li> 032 <li>Ambiguities are indicated by listing the acceptable amino acids for a given position, between square parentheses '[ ]'. 033 For example: [ALT] stands for Ala or Leu or Thr.</li> 034 <li>Ambiguities are also indicated by listing between a pair of curly brackets '{ }' the amino acids that are not accepted 035 at a given position. For example: {AM} stands for any amino acid except Ala and Met.</li> 036 <li>Each element in a pattern is separated from its neighbor by a '-'.</li> 037 <li>Repetition of an element of the pattern can be indicated by following that element with a numerical value or a numerical 038 range between parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to x-x or x-x-x or x-x-x-x.</li> 039 <li>When a pattern is restricted to either the N- or C-terminal of a sequence, that pattern either starts with a '<' symbol 040 or respectively ends with a '>' symbol. In some rare cases (e.g. PS00267 or PS00539), '>' can also occur inside square 041 brackets for the C-terminal element. 'F-[GSTV]-P-R-L-[G>]' means that either 'F-[GSTV]-P-R-L-G' or 'F-[GSTV]-P-R-L>' are considered.</li> 042 <li>A period ends the pattern.</li> 043 </ul> 044 <div> 045 Examples: 046 <pre> 047 PA [AC]-x-V-x(4)-{ED}. 048 </pre> 049 This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp} 050 <pre> 051 PA <A-x-[ST](2)-x(0,1)-V. 052 </pre> 053 This pattern, which must be in the N-terminal of the sequence ('<'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val 054 </div> 055 @author J. Alex Taylor, hairyfatguy.com 056 */ 057//------------------------------------------------------------------------------ 058// com.hfg Library 059// 060// This library is free software; you can redistribute it and/or 061// modify it under the terms of the GNU Lesser General Public 062// License as published by the Free Software Foundation; either 063// version 2.1 of the License, or (at your option) any later version. 064// 065// This library is distributed in the hope that it will be useful, 066// but WITHOUT ANY WARRANTY; without even the implied warranty of 067// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 068// Lesser General Public License for more details. 069// 070// You should have received a copy of the GNU Lesser General Public 071// License along with this library; if not, write to the Free Software 072// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 073// 074// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 075// jataylor@hairyfatguy.com 076//------------------------------------------------------------------------------ 077 078public abstract class SeqPattern<S extends BioSequence, T extends SeqPatternMatch> implements Cloneable 079{ 080 private String mName; 081 private boolean mLocked; 082 private String mPrositePattern; 083 private List<PrositePatternPosition> mPatternPositions; 084 private boolean mIsCaseSensitive = false; 085 private boolean mIgnoreGaps = false; 086 private boolean mContainsPositionAmbiguity; 087 private boolean mContainsRanges; 088 private boolean mIsRestrictedToSeqStart; 089 private boolean mIsRestrictedToSeqEnd; 090 private int mMaxMismatches = 0; 091 092 protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)(\\?)?\\)$"); 093 094 //########################################################################### 095 // CONSTRUCTORS 096 //########################################################################### 097 098 //-------------------------------------------------------------------------- 099 protected SeqPattern() 100 { 101 } 102 103 //-------------------------------------------------------------------------- 104 public SeqPattern(String inPrositePattern) 105 { 106 setPrositePattern(inPrositePattern); 107 } 108 109 //########################################################################### 110 // PUBLIC METHODS 111 //########################################################################### 112 113 114 //-------------------------------------------------------------------------- 115 public SeqPattern clone() 116 { 117 SeqPattern cloneObj; 118 try 119 { 120 cloneObj = (SeqPattern) super.clone(); 121 } 122 catch (CloneNotSupportedException e) 123 { 124 throw new ProgrammingException(e); 125 } 126 127 if (mPatternPositions != null) 128 { 129 cloneObj.mPatternPositions = new ArrayList(mPatternPositions); 130 } 131 132 return cloneObj; 133 } 134 135 //-------------------------------------------------------------------------- 136 @Override 137 public String toString() 138 { 139 return mPrositePattern; 140 } 141 142 //--------------------------------------------------------------------------- 143 public SeqPattern setName(String inValue) 144 { 145 mName = inValue; 146 return this; 147 } 148 149 //--------------------------------------------------------------------------- 150 public String name() 151 { 152 return mName; 153 } 154 155 //-------------------------------------------------------------------------- 156 public boolean isLocked() 157 { 158 return mLocked; 159 } 160 161 //-------------------------------------------------------------------------- 162 public SeqPattern lock() 163 { 164 mLocked = true; 165 return this; 166 } 167 168 //-------------------------------------------------------------------------- 169 public abstract BioSequenceType getBioSequenceType(); 170 171 //-------------------------------------------------------------------------- 172 public SeqPattern setIgnoreGaps(boolean inValue) 173 { 174 if (inValue != mIgnoreGaps) 175 { 176 mIgnoreGaps = inValue; 177 } 178 179 return this; 180 } 181 182 //-------------------------------------------------------------------------- 183 public SeqPattern setIsCaseSensitive(boolean inValue) 184 { 185 mIsCaseSensitive = inValue; 186 187 return this; 188 } 189 190 //-------------------------------------------------------------------------- 191 public boolean isCaseSensitive() 192 { 193 return mIsCaseSensitive; 194 } 195 196 //-------------------------------------------------------------------------- 197 public boolean getIgnoreGaps() 198 { 199 return mIgnoreGaps; 200 } 201 202 //-------------------------------------------------------------------------- 203 protected void setPrositePattern(String inValue) 204 { 205 if (inValue != null) 206 { 207 mPrositePattern = inValue.trim(); 208 if (! isCaseSensitive()) 209 { 210 mPrositePattern = mPrositePattern.toUpperCase(); 211 } 212 } 213 214 evaluate(mPrositePattern); 215 } 216 217 //-------------------------------------------------------------------------- 218 public String getPrositePattern() 219 { 220 return mPrositePattern; 221 } 222 223 //-------------------------------------------------------------------------- 224 public SeqPattern setMaxMismatches(int inValue) 225 { 226 // TODO: check number against the number of pattern positions 227 mMaxMismatches = inValue; 228 return this; 229 } 230 231 //-------------------------------------------------------------------------- 232 public int getMaxMismatches() 233 { 234 return mMaxMismatches; 235 } 236 237 //-------------------------------------------------------------------------- 238 public boolean containsMismatchRestrictions() 239 { 240 boolean result = false; 241 242 if (CollectionUtil.hasValues(getPrositePatternPositions())) 243 { 244 for (PrositePatternPosition position : getPrositePatternPositions()) 245 { 246 if (position.mismatchNotAllowed()) 247 { 248 result = true; 249 break; 250 } 251 } 252 } 253 254 return result; 255 } 256 257 //-------------------------------------------------------------------------- 258 public boolean containsPositionAmbiguity() 259 { 260 return mContainsPositionAmbiguity; 261 } 262 263 //-------------------------------------------------------------------------- 264 public boolean containsRanges() 265 { 266 return mContainsRanges; 267 } 268 269 //-------------------------------------------------------------------------- 270 public boolean isRestrictedToSeqStart() 271 { 272 return mIsRestrictedToSeqStart; 273 } 274 275 //-------------------------------------------------------------------------- 276 public boolean isRestrictedToSeqEnd() 277 { 278 return mIsRestrictedToSeqEnd; 279 } 280 281 282 283 //--------------------------------------------------------------------------- 284 public int getMaxLength() 285 { 286 int maxLength = 0; 287 List<PrositePatternPosition> positions = getPrositePatternPositions(); 288 if (CollectionUtil.hasValues(positions)) 289 { 290 maxLength = positions.size(); 291 for (PrositePatternPosition position : positions) 292 { 293 if (position.hasCountRange()) 294 { 295 maxLength += position.getCountRange().getEnd() - 1; 296 } 297 } 298 } 299 300 return maxLength; 301 } 302 303 //--------------------------------------------------------------------------- 304 public List<PrositePatternPosition> getPrositePatternPositions() 305 { 306 if (null == mPatternPositions) 307 { 308 String prositePattern = getPrositePattern(); 309 310 // Remove the period at the end 311 if (prositePattern.endsWith(".")) 312 { 313 prositePattern = prositePattern.substring(0, prositePattern.length() - 1); 314 } 315 316 // Dashes separate positions 317 String[] positions = prositePattern.split("\\-"); 318 List<PrositePatternPosition> patternPositions = new ArrayList<>(positions.length); 319 int positionIndex = 0; 320 for (String positionString : positions) 321 { 322 positionIndex++; 323 324 PrositePatternPosition position = new PrositePatternPosition(); 325 326 if (positionString.endsWith("!")) // Mismatch not allowed? 327 { 328 position.setMismatchNotAllowed(true); 329 positionString = positionString.substring(0, positionString.length() - 1); 330 } 331 332 if (positionString.startsWith("<")) // N-terminus? 333 { 334 positionString = positionString.substring(1); 335 } 336 337 if (positionString.endsWith(">")) // C-terminus? 338 { 339 positionString = positionString.substring(0, positionString.length() - 1); 340 } 341 342 343 Matcher m = SeqPattern.PROSITE_COUNT_PATTERN.matcher(positionString); 344 if (m.find()) 345 { 346 int min = -1; 347 int max = -1; 348 String[] pieces = m.group(1).split(","); 349 if (1 == pieces.length) 350 { 351 min = max = Integer.parseInt(pieces[0]); 352 } 353 else 354 { 355 min = Integer.parseInt(pieces[0]); 356 max = Integer.parseInt(pieces[1]); 357 } 358 359 position.setCountRange(new Range<>(min, max)); 360 positionString = positionString.substring(0, m.start(1) - 1); 361 position.setUseLazyMatchMode(m.group(2) != null); 362 } 363 364 if (positionString.startsWith("[")) 365 { 366 position.setType(PrositePatternPositionType.ONE_OF); 367 368 // TODO: Handle ambiguous protein residues 369 StringBuilder buffer = new StringBuilder(); 370 for (int i = 1; i < positionString.length() - 1; i++) 371 { 372 char residue = Character.toUpperCase(positionString.charAt(i)); 373 buffer.append(residue); 374 375 if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) 376 { 377 Nucleotide base = Nucleotide.valueOf(residue); 378 if (null == base) 379 { 380 throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(positions[positionIndex - 1]) + " contains an invalid nucleotide value!"); 381 } 382 383 if (base.isAmbiguous()) 384 { 385 buffer.append(base.getDegeneracyAsString()); 386 } 387 } 388 } 389 390 position.setResidues(buffer.toString()); 391 } 392 else if (positionString.startsWith("{")) 393 { 394 position.setType(PrositePatternPositionType.NOT); 395 396 // TODO: Handle ambiguous protein residues 397 StringBuilder buffer = new StringBuilder(); 398 for (int i = 1; i < positionString.length() - 1; i++) 399 { 400 char residue = Character.toUpperCase(positionString.charAt(i)); 401 buffer.append(residue); 402 403 if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) 404 { 405 Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase()); 406 if (base.isAmbiguous()) 407 { 408 buffer.append(base.getDegeneracyAsString()); 409 } 410 } 411 } 412 413 position.setResidues(buffer.toString()); 414 } 415 else if ((positionString.equalsIgnoreCase("X") 416 && getBioSequenceType().equals(BioSequenceType.PROTEIN)) 417 || (positionString.equalsIgnoreCase("N") 418 && getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))) 419 { 420 position.setType(PrositePatternPositionType.IS_ANY); 421 } 422 else 423 { 424 PrositePatternPositionType type = PrositePatternPositionType.IS; 425 426 // TODO: Handle ambiguous protein residues 427 if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)) 428 { 429 Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase()); 430 if (base.isAmbiguous()) 431 { 432 type = PrositePatternPositionType.ONE_OF; 433 positionString += base.getDegeneracyAsString(); 434 } 435 } 436 437 position.setType(type); 438 position.setResidues(positionString); 439 } 440 441 if (position.getCountRange() != null 442 && 1 == position.getCountRange().length()) 443 { 444 // Fixed number of identical positions. Unroll rather than treating as a range 445 int count = position.getCountRange().getStart(); 446 position.setCountRange(null); 447 for (int i = 0; i < count; i++) 448 { 449 patternPositions.add(position); 450 } 451 } 452 else 453 { 454 patternPositions.add(position); 455 } 456 457 458 } 459 460 mPatternPositions = patternPositions; 461 } 462 463 return mPatternPositions; 464 } 465 466 //-------------------------------------------------------------------------- 467 public SeqPatternMatcher<S, T> matcher(S inTarget) 468 { 469 return matcher(inTarget, null); 470 } 471 472 //-------------------------------------------------------------------------- 473 public SeqPatternMatcher<S, T> matcher(S inTarget, SeqLocation inSeqLocation) 474 { 475 SeqPatternMatcher<S, T> matcher; 476 477 if (0 == getMaxMismatches()) 478 { 479 matcher = new RegExpMatcher<>(this, inTarget, inSeqLocation); 480 } 481 else if (! containsPositionAmbiguity() 482 && ! containsRanges() 483 && ! containsMismatchRestrictions()) 484 { 485 matcher = new BYPMatcher<>(this, inTarget, inSeqLocation); 486 } 487 else 488 { 489 // Default to brute force 490 matcher = new BruteForceMatcher<>(this, inTarget, inSeqLocation); 491 } 492 493 return matcher; 494 } 495 496 //########################################################################### 497 // PROTECTED METHODS 498 //########################################################################### 499 500 //-------------------------------------------------------------------------- 501 protected abstract T createMatch(String inSeq, SeqLocation inLocation); 502 503 //--------------------------------------------------------------------------- 504 private void evaluate(String inPrositePattern) 505 { 506 String prositePattern = inPrositePattern; 507 508 // Remove the period at the end 509 if (prositePattern.endsWith(".")) 510 { 511 prositePattern = prositePattern.substring(0, prositePattern.length() - 1); 512 } 513 514 // Dashes separate positions 515 String[] positions = prositePattern.split("\\-"); 516 int positionIndex = 0; 517 for (String position : positions) 518 { 519 positionIndex++; 520 521 position = position.trim(); 522 523 524 if (position.endsWith("!")) // Mismatch not allowed? 525 { 526 // Remove it so we can detect C-terminal restriction 527 position = position.substring(0, position.length() - 1); 528 } 529 530 531 if (position.startsWith("<")) // N-terminus? 532 { 533 mIsRestrictedToSeqStart = true; 534 position = position.substring(1); 535 } 536 537 if (position.endsWith(">")) // C-terminus? 538 { 539 mIsRestrictedToSeqEnd = true; 540 position = position.substring(0, position.length() - 1); 541 } 542 543 // Extract the count spec if present 544 String countSpec = null; 545 Matcher m = PROSITE_COUNT_PATTERN.matcher(position); 546 if (m.find()) 547 { 548 mContainsRanges = true; 549 } 550 551 if (position.startsWith("{")) 552 { 553 if (! position.endsWith("}")) 554 { 555 throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern " 556 + StringUtil.singleQuote(inPrositePattern) + " starts with '{' but doesn't end with '}'!"); 557 } 558 559 mContainsPositionAmbiguity = true; 560 } 561 else if (position.startsWith("[")) 562 { 563 if (! position.endsWith("]")) 564 { 565 throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern " 566 + StringUtil.singleQuote(inPrositePattern) + " starts with '[' but doesn't end with ']'!"); 567 } 568 569 mContainsPositionAmbiguity = true; 570 } 571 else if (getBioSequenceType().equals(BioSequenceType.PROTEIN) 572 && position.equalsIgnoreCase("x")) 573 { 574 mContainsPositionAmbiguity = true; 575 } 576 else if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID) 577 && Nucleotide.valueOf(position.charAt(0)).isAmbiguous()) 578 { 579 mContainsPositionAmbiguity = true; 580 } 581 } 582 } 583}