001package com.hfg.bio.seq.pattern; 002 003import java.util.ArrayList; 004import java.util.List; 005import java.util.regex.Matcher; 006 007import com.hfg.bio.seq.BioSequenceType; 008import com.hfg.bio.seq.Protein; 009import com.hfg.bio.seq.SeqLocation; 010 011//------------------------------------------------------------------------------ 012/** 013 Container for a protein pattern (motif). 014 <p> 015 From the <a href='http://prosite.expasy.org/prosuser.html#conv_pa'>PROSITE user manual</a>: 016 </p> 017 <p> 018 The patterns are described using the following conventions: 019 <ul> 020 <li>The standard IUPAC one-letter codes for the amino acids are used.</li> 021 <li>The symbol 'x' is used for a position where any amino acid is accepted.</li> 022 <li>Ambiguities are indicated by listing the acceptable amino acids for a given position, between square parentheses '[ ]'. 023 For example: [ALT] stands for Ala or Leu or Thr.</li> 024 <li>Ambiguities are also indicated by listing between a pair of curly brackets '{ }' the amino acids that are not accepted 025 at a given position. For example: {AM} stands for any amino acid except Ala and Met.</li> 026 <li>Each element in a pattern is separated from its neighbor by a '-'.</li> 027 <li>Repetition of an element of the pattern can be indicated by following that element with a numerical value or a numerical 028 range between parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to x-x or x-x-x or x-x-x-x.</li> 029 <li>When a pattern is restricted to either the N- or C-terminal of a sequence, that pattern either starts with a '<' symbol 030 or respectively ends with a '>' symbol. In some rare cases (e.g. PS00267 or PS00539), '>' can also occur inside square 031 brackets for the C-terminal element. 'F-[GSTV]-P-R-L-[G>]' means that either 'F-[GSTV]-P-R-L-G' or 'F-[GSTV]-P-R-L>' are considered.</li> 032 <li>A period ends the pattern.</li> 033 </ul> 034 <div> 035 Examples: 036 <pre> 037 PA [AC]-x-V-x(4)-{ED}. 038 </pre> 039 This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp} 040 <pre> 041 PA <A-x-[ST](2)-x(0,1)-V. 042 </pre> 043 This pattern, which must be in the N-terminal of the sequence ('<'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val 044 </div> 045 @author J. Alex Taylor, hairyfatguy.com 046 */ 047//------------------------------------------------------------------------------ 048// com.hfg Library 049// 050// This library is free software; you can redistribute it and/or 051// modify it under the terms of the GNU Lesser General Public 052// License as published by the Free Software Foundation; either 053// version 2.1 of the License, or (at your option) any later version. 054// 055// This library is distributed in the hope that it will be useful, 056// but WITHOUT ANY WARRANTY; without even the implied warranty of 057// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 058// Lesser General Public License for more details. 059// 060// You should have received a copy of the GNU Lesser General Public 061// License along with this library; if not, write to the Free Software 062// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 063// 064// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 065// jataylor@hairyfatguy.com 066//------------------------------------------------------------------------------ 067 068public class ProteinPattern extends SeqPattern<Protein, ProteinPatternMatch> 069{ 070 //########################################################################### 071 // CONSTRUCTORS 072 //########################################################################### 073 074 //-------------------------------------------------------------------------- 075 public ProteinPattern(String inPrositePattern) 076 { 077 super(inPrositePattern); 078 } 079 080 //########################################################################### 081 // PUBLIC METHODS 082 //########################################################################### 083 084 //-------------------------------------------------------------------------- 085 public BioSequenceType getBioSequenceType() 086 { 087 return BioSequenceType.PROTEIN; 088 } 089 090 //-------------------------------------------------------------------------- 091 public ProteinPattern setIgnoreGaps(boolean inValue) 092 { 093 return (ProteinPattern) super.setIgnoreGaps(inValue); 094 } 095 096 //-------------------------------------------------------------------------- 097 @Override 098 public ProteinPattern setMaxMismatches(int inValue) 099 { 100 return (ProteinPattern) super.setMaxMismatches(inValue); 101 } 102 103 //-------------------------------------------------------------------------- 104 protected ProteinPatternMatch createMatch(String inSeq, SeqLocation inLocation) 105 { 106 return new ProteinPatternMatch(this, inSeq, inLocation); 107 } 108 109}