Source code

001package com.hfg.bio.seq.pattern;
002
003import java.util.ArrayList;
004import java.util.Arrays;
005import java.util.Collection;
006import java.util.List;
007import java.util.regex.Matcher;
008import java.util.regex.Pattern;
009
010import com.hfg.bio.Nucleotide;
011import com.hfg.bio.Strand;
012import com.hfg.bio.seq.BioSequenceType;
013import com.hfg.bio.seq.NucleicAcid;
014import com.hfg.bio.seq.SeqLocation;
015import com.hfg.util.StringBuilderPlus;
016import com.hfg.util.StringUtil;
017
018//------------------------------------------------------------------------------
019/**
020 Container for a nucleotide pattern (motif).
021
022 <div>
023 @author J. Alex Taylor, hairyfatguy.com
024 </div>
025 */
026//------------------------------------------------------------------------------
027// com.hfg Library
028//
029// This library is free software; you can redistribute it and/or
030// modify it under the terms of the GNU Lesser General Public
031// License as published by the Free Software Foundation; either
032// version 2.1 of the License, or (at your option) any later version.
033//
034// This library is distributed in the hope that it will be useful,
035// but WITHOUT ANY WARRANTY; without even the implied warranty of
036// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
037// Lesser General Public License for more details.
038//
039// You should have received a copy of the GNU Lesser General Public
040// License along with this library; if not, write to the Free Software
041// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
042//
043// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
044// jataylor@hairyfatguy.com
045//------------------------------------------------------------------------------
046
047public class NucleotidePattern<T extends NucleotidePatternMatch> extends SeqPattern<NucleicAcid, T>
048{
049   private List<Strand> mStrandsToSearch = Arrays.asList(Strand.values()); // Default to searching both strands
050   private Pattern  mAlternateStrandPattern;
051
052   //###########################################################################
053   // CONSTRUCTORS
054   //###########################################################################
055
056   //--------------------------------------------------------------------------
057   protected NucleotidePattern()
058   {
059   }
060
061   //--------------------------------------------------------------------------
062   public NucleotidePattern(String inPatternString)
063   {
064      super(inPatternString.toUpperCase());
065   }
066
067   //###########################################################################
068   // PUBLIC METHODS
069   //###########################################################################
070
071   //--------------------------------------------------------------------------
072   public NucleotidePattern clone()
073   {
074      NucleotidePattern cloneObj = (NucleotidePattern) super.clone();
075
076      if (mStrandsToSearch != null)
077      {
078         cloneObj.mStrandsToSearch = new ArrayList(mStrandsToSearch);
079      }
080
081      return cloneObj;
082   }
083
084   //--------------------------------------------------------------------------
085   public BioSequenceType getBioSequenceType()
086   {
087      return BioSequenceType.NUCLEIC_ACID;
088   }
089
090   //--------------------------------------------------------------------------
091   @Override
092   public NucleotidePattern setIgnoreGaps(boolean inValue)
093   {
094      return (NucleotidePattern) super.setIgnoreGaps(inValue);
095   }
096
097   //--------------------------------------------------------------------------
098   @Override
099   public NucleotidePattern setMaxMismatches(int inValue)
100   {
101      return (NucleotidePattern) super.setMaxMismatches(inValue);
102   }
103
104   //--------------------------------------------------------------------------
105   public NucleotidePattern setStrandsToSearch(Strand inValue)
106   {
107      mStrandsToSearch = new ArrayList<>(1);
108      mStrandsToSearch.add(inValue);
109      return this;
110   }
111
112   //--------------------------------------------------------------------------
113   public NucleotidePattern setStrandsToSearch(Strand[] inValues)
114   {
115      mStrandsToSearch = Arrays.asList(inValues);
116      return this;
117   }
118
119   //--------------------------------------------------------------------------
120   public Collection<Strand> getStrandsToSearch()
121   {
122      return mStrandsToSearch;
123   }
124
125
126   //--------------------------------------------------------------------------
127   @Override
128   protected T createMatch(String inSeq, SeqLocation inLocation)
129   {
130      return (T) new NucleotidePatternMatch(this, inSeq, inLocation);
131   }
132
133   //###########################################################################
134   // PROTECTED METHODS
135   //###########################################################################
136
137   //--------------------------------------------------------------------------
138   protected String convertStringToRegExp(String inPrositePattern)
139   {
140      // Remove the period at the end
141      if (inPrositePattern.endsWith("."))
142      {
143         inPrositePattern = inPrositePattern.substring(0, inPrositePattern.length() - 1);
144      }
145
146      StringBuilderPlus regexp = new StringBuilderPlus().setDelimiter(getIgnoreGaps() ? "\\-*" : "");
147
148      // Dashes separate positions
149      String[] positions = inPrositePattern.split("\\-");
150      for (String position : positions)
151      {
152         boolean nTerm = false;
153         boolean cTerm = false;
154         if (position.startsWith("<")) // 5'?
155         {
156            nTerm = true;
157            position = position.substring(1);
158         }
159
160         if (position.endsWith(">")) // 3'?
161         {
162            cTerm = true;
163            position = position.substring(0, position.length() - 1);
164         }
165
166         // Extract the count spec if present
167         String countSpec = null;
168         Matcher m = PROSITE_COUNT_PATTERN.matcher(position);
169         if (m.find())
170         {
171            countSpec = "{" + m.group(1) + "}";
172            position = position.substring(0, m.start(1) - 1);
173         }
174
175         if (position.startsWith("{")
176                  && position.endsWith("}"))
177         {
178            StringBuilder positionBuffer = new StringBuilder("[^");
179            for (int i = 1; i < position.length() - 1; i++)
180            {
181               char theChar = position.charAt(i);
182
183               Nucleotide base = Nucleotide.valueOf(theChar);
184               if (null == base
185                   && theChar != '<'
186                   && theChar != '>')
187               {
188                  throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
189               }
190
191               positionBuffer.append(theChar);
192               if (base != null
193                   && base.isAmbiguous())
194               {
195                  for (Nucleotide baseOption : base.getDegeneracy())
196                  {
197                     positionBuffer.append(baseOption.getOneLetterCode());
198                  }
199               }
200            }
201
202            if (getIgnoreGaps())
203            {
204               positionBuffer.append("\\-");
205            }
206
207            positionBuffer.append("]");
208
209            if (getIgnoreGaps()
210                && countSpec != null)
211            {
212               positionBuffer.insert(0, "(?:\\-*");
213               positionBuffer.append(")");
214            }
215
216            position = positionBuffer.toString();
217         }
218         else if (position.startsWith("[")
219                  && position.endsWith("]"))
220         {
221            StringBuilder positionBuffer = new StringBuilder("[");
222            for (int i = 1; i < position.length() - 1; i++)
223            {
224               char theChar = position.charAt(i);
225
226               Nucleotide base = Nucleotide.valueOf(theChar);
227               if (null == base
228                   && theChar != '<'
229                   && theChar != '>')
230               {
231                  throw new RuntimeException("Invalid base " + StringUtil.singleQuote(theChar) + " found in the nucleotide pattern!");
232               }
233
234               positionBuffer.append(theChar);
235               if (base != null
236                   && base.isAmbiguous())
237               {
238                  for (Nucleotide baseOption : base.getDegeneracy())
239                  {
240                     positionBuffer.append(baseOption.getOneLetterCode());
241                  }
242               }
243            }
244
245            positionBuffer.append("]");
246
247            if (getIgnoreGaps()
248                && countSpec != null)
249            {
250               positionBuffer.insert(0, "(?:\\-*");
251               positionBuffer.append(")");
252            }
253
254            position = positionBuffer.toString();
255         }
256         else
257         {
258            if (position.length() > 1)
259            {
260               throw new RuntimeException("Prosite position found with multiple required residues: " + StringUtil.singleQuote(position) + "!");
261            }
262
263            char residue = position.charAt(0);
264            Nucleotide base = Nucleotide.valueOf(residue);
265            if (null == base
266                && ! position.equals("<")
267                && ! position.equals(">"))
268            {
269               throw new RuntimeException("Invalid base " + StringUtil.singleQuote(residue) + " found in the nucleotide pattern!");
270            }
271
272            if (base != null
273                && base.isAmbiguous())
274            {
275               if (position.equalsIgnoreCase("N"))
276               {
277                  if (getIgnoreGaps())
278                  {
279                     position = "[^\\-]";
280                  }
281                  else
282                  {
283                     position = ".";
284                  }
285               }
286               else
287               {
288                  StringBuilder positionBuffer = new StringBuilder("[");
289                  positionBuffer.append(residue);
290
291                  for (Nucleotide baseOption : base.getDegeneracy())
292                  {
293                     positionBuffer.append(baseOption.getOneLetterCode());
294                  }
295
296                  positionBuffer.append("]");
297
298                  position = positionBuffer.toString();
299               }
300            }
301
302
303            if (getIgnoreGaps()
304                && countSpec != null)
305            {
306               position = "(?:\\-*" + position + ")";
307            }
308         }
309
310
311         if (position.contains("<"))
312         {
313            position = "(?:" + position.replace("<", "") + "|\\A)";
314         }
315         else if (position.contains(">"))
316         {
317            position = "(?:" + position.replace(">", "") + "|\\Z)";
318         }
319
320
321         if (getIgnoreGaps()
322             && countSpec != null)
323         {
324            regexp.append((nTerm ? "^" : "")
325                          + position
326                          + countSpec
327                          + (cTerm ? "$" : ""));
328         }
329         else
330         {
331            regexp.delimitedAppend((nTerm ? "^" : "")
332                                   + position
333                                   + (countSpec != null ? countSpec : "")
334                                   + (cTerm ? "$" : ""));
335         }
336      }
337
338      return regexp.toString();
339   }
340}