001package com.hfg.bio.seq.pattern;
002
003import java.util.ArrayList;
004import java.util.List;
005import java.util.regex.Matcher;
006import java.util.regex.Pattern;
007
008import com.hfg.bio.Nucleotide;
009import com.hfg.bio.seq.BioSequence;
010import com.hfg.bio.seq.BioSequenceType;
011import com.hfg.bio.seq.SeqLocation;
012import com.hfg.exception.ProgrammingException;
013import com.hfg.math.Range;
014import com.hfg.util.StringUtil;
015import com.hfg.util.collection.CollectionUtil;
016
017//------------------------------------------------------------------------------
018/**
019 Abstract container for a sequence pattern (motif).
020 <p>
021 The pattern syntax is based on PROSITE with one addition - the symbol '!' can be added to end of a position
022 specification to indicate that mismatches are not allowed at that position.
023 </p>
024 <p>
025 From the <a href='http://prosite.expasy.org/prosuser.html#conv_pa'>PROSITE user manual</a>:
026 </p>
027 <p>
028 The patterns are described using the following conventions:
029 <ul>
030   <li>The standard IUPAC one-letter codes for the amino acids are used.</li>
031   <li>The symbol 'x' is used for a position where any amino acid is accepted.</li>
032   <li>Ambiguities are indicated by listing the acceptable amino acids for a given position, between square parentheses '[ ]'.
033       For example: [ALT] stands for Ala or Leu or Thr.</li>
034   <li>Ambiguities are also indicated by listing between a pair of curly brackets '{ }' the amino acids that are not accepted
035       at a given position. For example: {AM} stands for any amino acid except Ala and Met.</li>
036   <li>Each element in a pattern is separated from its neighbor by a '-'.</li>
037   <li>Repetition of an element of the pattern can be indicated by following that element with a numerical value or a numerical
038       range between parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to x-x or x-x-x or x-x-x-x.</li>
039   <li>When a pattern is restricted to either the N- or C-terminal of a sequence, that pattern either starts with a '&lt;' symbol
040       or respectively ends with a '&gt;' symbol. In some rare cases (e.g. PS00267 or PS00539), '&gt;' can also occur inside square
041       brackets for the C-terminal element. 'F-[GSTV]-P-R-L-[G&gt;]' means that either 'F-[GSTV]-P-R-L-G' or 'F-[GSTV]-P-R-L&gt;' are considered.</li>
042   <li>A period ends the pattern.</li>
043 </ul>
044 <div>
045   Examples:
046   <pre>
047    PA   [AC]-x-V-x(4)-{ED}.
048   </pre>
049   This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any but Glu or Asp}
050   <pre>
051    PA   &lt;A-x-[ST](2)-x(0,1)-V.
052   </pre>
053   This pattern, which must be in the N-terminal of the sequence ('&lt;'), is translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
054 </div>
055 @author J. Alex Taylor, hairyfatguy.com
056 */
057//------------------------------------------------------------------------------
058// com.hfg Library
059//
060// This library is free software; you can redistribute it and/or
061// modify it under the terms of the GNU Lesser General Public
062// License as published by the Free Software Foundation; either
063// version 2.1 of the License, or (at your option) any later version.
064//
065// This library is distributed in the hope that it will be useful,
066// but WITHOUT ANY WARRANTY; without even the implied warranty of
067// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
068// Lesser General Public License for more details.
069//
070// You should have received a copy of the GNU Lesser General Public
071// License along with this library; if not, write to the Free Software
072// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
073//
074// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
075// jataylor@hairyfatguy.com
076//------------------------------------------------------------------------------
077
078public abstract class SeqPattern<S extends BioSequence, T extends SeqPatternMatch> implements Cloneable
079{
080   private String  mName;
081   private boolean mLocked;
082   private String  mPrositePattern;
083   private List<PrositePatternPosition> mPatternPositions;
084   private boolean mIsCaseSensitive = false;
085   private boolean mIgnoreGaps = false;
086   private boolean mContainsPositionAmbiguity;
087   private boolean mContainsRanges;
088   private boolean mIsRestrictedToSeqStart;
089   private boolean mIsRestrictedToSeqEnd;
090   private int     mMaxMismatches = 0;
091
092   protected static final Pattern PROSITE_COUNT_PATTERN = Pattern.compile("\\((\\d+(?:,\\d+)?)(\\?)?\\)$");
093
094   //###########################################################################
095   // CONSTRUCTORS
096   //###########################################################################
097
098   //--------------------------------------------------------------------------
099   protected SeqPattern()
100   {
101   }
102
103   //--------------------------------------------------------------------------
104   public SeqPattern(String inPrositePattern)
105   {
106      setPrositePattern(inPrositePattern);
107   }
108
109   //###########################################################################
110   // PUBLIC METHODS
111   //###########################################################################
112
113
114   //--------------------------------------------------------------------------
115   public SeqPattern clone()
116   {
117      SeqPattern cloneObj;
118      try
119      {
120         cloneObj = (SeqPattern) super.clone();
121      }
122      catch (CloneNotSupportedException e)
123      {
124         throw new ProgrammingException(e);
125      }
126
127      if (mPatternPositions != null)
128      {
129         cloneObj.mPatternPositions = new ArrayList(mPatternPositions);
130      }
131
132      return cloneObj;
133   }
134
135   //--------------------------------------------------------------------------
136   @Override
137   public String toString()
138   {
139      return mPrositePattern;
140   }
141
142   //---------------------------------------------------------------------------
143   public SeqPattern setName(String inValue)
144   {
145      mName = inValue;
146      return this;
147   }
148
149   //---------------------------------------------------------------------------
150   public String name()
151   {
152      return mName;
153   }
154
155   //--------------------------------------------------------------------------
156   public boolean isLocked()
157   {
158      return mLocked;
159   }
160
161   //--------------------------------------------------------------------------
162   public SeqPattern lock()
163   {
164      mLocked = true;
165      return this;
166   }
167
168   //--------------------------------------------------------------------------
169   public abstract BioSequenceType getBioSequenceType();
170
171   //--------------------------------------------------------------------------
172   public SeqPattern setIgnoreGaps(boolean inValue)
173   {
174      if (inValue != mIgnoreGaps)
175      {
176         mIgnoreGaps = inValue;
177      }
178
179      return this;
180   }
181
182   //--------------------------------------------------------------------------
183   public SeqPattern setIsCaseSensitive(boolean inValue)
184   {
185      mIsCaseSensitive = inValue;
186      
187      return this;
188   }
189
190   //--------------------------------------------------------------------------
191   public boolean isCaseSensitive()
192   {
193      return mIsCaseSensitive;
194   }
195
196   //--------------------------------------------------------------------------
197   public boolean getIgnoreGaps()
198   {
199      return mIgnoreGaps;
200   }
201
202   //--------------------------------------------------------------------------
203   protected void setPrositePattern(String inValue)
204   {
205      if (inValue != null)
206      {
207         mPrositePattern = inValue.trim();
208         if (! isCaseSensitive())
209         {
210            mPrositePattern = mPrositePattern.toUpperCase();
211         }
212      }
213
214      evaluate(mPrositePattern);
215   }
216
217   //--------------------------------------------------------------------------
218   public String getPrositePattern()
219   {
220      return mPrositePattern;
221   }
222
223   //--------------------------------------------------------------------------
224   public SeqPattern setMaxMismatches(int inValue)
225   {
226      // TODO: check number against the number of pattern positions
227      mMaxMismatches = inValue;
228      return this;
229   }
230
231   //--------------------------------------------------------------------------
232   public int getMaxMismatches()
233   {
234      return mMaxMismatches;
235   }
236
237   //--------------------------------------------------------------------------
238   public boolean containsMismatchRestrictions()
239   {
240      boolean result = false;
241
242      if (CollectionUtil.hasValues(getPrositePatternPositions()))
243      {
244         for (PrositePatternPosition position : getPrositePatternPositions())
245         {
246            if (position.mismatchNotAllowed())
247            {
248               result = true;
249               break;
250            }
251         }
252      }
253
254      return result;
255   }
256
257   //--------------------------------------------------------------------------
258   public boolean containsPositionAmbiguity()
259   {
260      return mContainsPositionAmbiguity;
261   }
262
263   //--------------------------------------------------------------------------
264   public boolean containsRanges()
265   {
266      return mContainsRanges;
267   }
268
269   //--------------------------------------------------------------------------
270   public boolean isRestrictedToSeqStart()
271   {
272      return mIsRestrictedToSeqStart;
273   }
274
275   //--------------------------------------------------------------------------
276   public boolean isRestrictedToSeqEnd()
277   {
278      return mIsRestrictedToSeqEnd;
279   }
280
281
282
283   //---------------------------------------------------------------------------
284   public int getMaxLength()
285   {
286      int maxLength = 0;
287      List<PrositePatternPosition> positions = getPrositePatternPositions();
288      if (CollectionUtil.hasValues(positions))
289      {
290         maxLength = positions.size();
291         for (PrositePatternPosition position : positions)
292         {
293            if (position.hasCountRange())
294            {
295               maxLength += position.getCountRange().getEnd() - 1;
296            }
297         }
298      }
299
300      return maxLength;
301   }
302
303   //---------------------------------------------------------------------------
304   public List<PrositePatternPosition> getPrositePatternPositions()
305   {
306      if (null == mPatternPositions)
307      {
308         String prositePattern = getPrositePattern();
309
310         // Remove the period at the end
311         if (prositePattern.endsWith("."))
312         {
313            prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
314         }
315
316         // Dashes separate positions
317         String[] positions = prositePattern.split("\\-");
318         List<PrositePatternPosition> patternPositions = new ArrayList<>(positions.length);
319         int positionIndex = 0;
320         for (String positionString : positions)
321         {
322            positionIndex++;
323
324            PrositePatternPosition position = new PrositePatternPosition();
325
326            if (positionString.endsWith("!")) // Mismatch not allowed?
327            {
328               position.setMismatchNotAllowed(true);
329               positionString = positionString.substring(0, positionString.length() - 1);
330            }
331
332            if (positionString.startsWith("<")) // N-terminus?
333            {
334               positionString = positionString.substring(1);
335            }
336
337            if (positionString.endsWith(">")) // C-terminus?
338            {
339               positionString = positionString.substring(0, positionString.length() - 1);
340            }
341
342
343            Matcher m = SeqPattern.PROSITE_COUNT_PATTERN.matcher(positionString);
344            if (m.find())
345            {
346               int min = -1;
347               int max = -1;
348               String[] pieces = m.group(1).split(",");
349               if (1 == pieces.length)
350               {
351                  min = max = Integer.parseInt(pieces[0]);
352               }
353               else
354               {
355                  min = Integer.parseInt(pieces[0]);
356                  max = Integer.parseInt(pieces[1]);
357               }
358
359               position.setCountRange(new Range<>(min, max));
360               positionString = positionString.substring(0, m.start(1) - 1);
361               position.setUseLazyMatchMode(m.group(2) != null);
362            }
363
364            if (positionString.startsWith("["))
365            {
366               position.setType(PrositePatternPositionType.ONE_OF);
367
368               // TODO: Handle ambiguous protein residues
369               StringBuilder buffer = new StringBuilder();
370               for (int i = 1; i < positionString.length() - 1; i++)
371               {
372                  char residue = Character.toUpperCase(positionString.charAt(i));
373                  buffer.append(residue);
374
375                  if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))
376                  {
377                     Nucleotide base = Nucleotide.valueOf(residue);
378                     if (null == base)
379                     {
380                        throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(positions[positionIndex - 1]) + " contains an invalid nucleotide value!");
381                     }
382
383                     if (base.isAmbiguous())
384                     {
385                        buffer.append(base.getDegeneracyAsString());
386                     }
387                  }
388               }
389
390               position.setResidues(buffer.toString());
391            }
392            else if (positionString.startsWith("{"))
393            {
394               position.setType(PrositePatternPositionType.NOT);
395
396               // TODO: Handle ambiguous protein residues
397               StringBuilder buffer = new StringBuilder();
398               for (int i = 1; i < positionString.length() - 1; i++)
399               {
400                  char residue = Character.toUpperCase(positionString.charAt(i));
401                  buffer.append(residue);
402
403                  if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))
404                  {
405                     Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase());
406                     if (base.isAmbiguous())
407                     {
408                        buffer.append(base.getDegeneracyAsString());
409                     }
410                  }
411               }
412
413               position.setResidues(buffer.toString());
414            }
415            else if ((positionString.equalsIgnoreCase("X")
416                      && getBioSequenceType().equals(BioSequenceType.PROTEIN))
417                     || (positionString.equalsIgnoreCase("N")
418                         && getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)))
419            {
420               position.setType(PrositePatternPositionType.IS_ANY);
421            }
422            else
423            {
424               PrositePatternPositionType type = PrositePatternPositionType.IS;
425
426               // TODO: Handle ambiguous protein residues
427               if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID))
428               {
429                  Nucleotide base = Nucleotide.valueOf(positionString.toUpperCase());
430                  if (base.isAmbiguous())
431                  {
432                     type = PrositePatternPositionType.ONE_OF;
433                     positionString += base.getDegeneracyAsString();
434                  }
435               }
436
437               position.setType(type);
438               position.setResidues(positionString);
439            }
440
441            if (position.getCountRange() != null
442                && 1 == position.getCountRange().length())
443            {
444               // Fixed number of identical positions. Unroll rather than treating as a range
445               int count = position.getCountRange().getStart();
446               position.setCountRange(null);
447               for (int i = 0; i < count; i++)
448               {
449                  patternPositions.add(position);
450               }
451            }
452            else
453            {
454               patternPositions.add(position);
455            }
456
457
458         }
459
460         mPatternPositions = patternPositions;
461      }
462
463      return mPatternPositions;
464   }
465
466   //--------------------------------------------------------------------------
467   public SeqPatternMatcher<S, T> matcher(S inTarget)
468   {
469      return matcher(inTarget, null);
470   }
471
472   //--------------------------------------------------------------------------
473   public SeqPatternMatcher<S, T> matcher(S inTarget, SeqLocation inSeqLocation)
474   {
475      SeqPatternMatcher<S, T> matcher;
476
477      if (0 == getMaxMismatches())
478      {
479         matcher = new RegExpMatcher<>(this, inTarget, inSeqLocation);
480      }
481      else if (! containsPositionAmbiguity()
482               && ! containsRanges()
483               && ! containsMismatchRestrictions())
484      {
485         matcher = new BYPMatcher<>(this, inTarget, inSeqLocation);
486      }
487      else
488      {
489         // Default to brute force
490         matcher = new BruteForceMatcher<>(this, inTarget, inSeqLocation);
491      }
492
493      return matcher;
494   }
495
496   //###########################################################################
497   // PROTECTED METHODS
498   //###########################################################################
499
500   //--------------------------------------------------------------------------
501   protected abstract T createMatch(String inSeq, SeqLocation inLocation);
502
503   //---------------------------------------------------------------------------
504   private void evaluate(String inPrositePattern)
505   {
506      String prositePattern = inPrositePattern;
507
508      // Remove the period at the end
509      if (prositePattern.endsWith("."))
510      {
511         prositePattern = prositePattern.substring(0, prositePattern.length() - 1);
512      }
513
514      // Dashes separate positions
515      String[] positions = prositePattern.split("\\-");
516      int positionIndex = 0;
517      for (String position : positions)
518      {
519         positionIndex++;
520
521         position = position.trim();
522
523
524         if (position.endsWith("!")) // Mismatch not allowed?
525         {
526            // Remove it so we can detect C-terminal restriction
527            position = position.substring(0, position.length() - 1);
528         }
529
530
531         if (position.startsWith("<")) // N-terminus?
532         {
533            mIsRestrictedToSeqStart = true;
534            position = position.substring(1);
535         }
536
537         if (position.endsWith(">")) // C-terminus?
538         {
539            mIsRestrictedToSeqEnd = true;
540            position = position.substring(0, position.length() - 1);
541         }
542
543         // Extract the count spec if present
544         String countSpec = null;
545         Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
546         if (m.find())
547         {
548            mContainsRanges = true;
549         }
550
551         if (position.startsWith("{"))
552         {
553            if (! position.endsWith("}"))
554            {
555               throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern "
556                                                          + StringUtil.singleQuote(inPrositePattern) + " starts with '{' but doesn't end with '}'!");
557            }
558
559            mContainsPositionAmbiguity = true;
560         }
561         else if (position.startsWith("["))
562         {
563            if (! position.endsWith("]"))
564            {
565               throw new SeqPatternConfigurationException("Position " + positionIndex + " " + StringUtil.singleQuote(position) + " of Prosite pattern "
566                                                          + StringUtil.singleQuote(inPrositePattern) + " starts with '[' but doesn't end with ']'!");
567            }
568
569            mContainsPositionAmbiguity = true;
570         }
571         else if (getBioSequenceType().equals(BioSequenceType.PROTEIN)
572                  && position.equalsIgnoreCase("x"))
573         {
574            mContainsPositionAmbiguity = true;
575         }
576         else if (getBioSequenceType().equals(BioSequenceType.NUCLEIC_ACID)
577                  && Nucleotide.valueOf(position.charAt(0)).isAmbiguous())
578         {
579            mContainsPositionAmbiguity = true;
580         }
581      }
582   }
583}