001package com.hfg.bio.seq.alignment;
002
003import java.util.*;
004
005import com.hfg.bio.phylogeny.DistanceMatrix;
006import com.hfg.bio.phylogeny.DistanceMatrixModel;
007import com.hfg.bio.phylogeny.UncorrectedModel;
008import com.hfg.bio.seq.PositionalFrequencyMatrix;
009import com.hfg.bio.seq.BioSequence;
010import com.hfg.bio.seq.BioSequenceType;
011import com.hfg.exception.ProgrammingException;
012import com.hfg.network.Edge;
013import com.hfg.util.StringUtil;
014import com.hfg.util.collection.CollectionUtil;
015import com.hfg.util.collection.DataTable;
016import com.hfg.util.collection.SparseMatrix;
017
018//------------------------------------------------------------------------------
019/**
020 Container for aligned sequences.
021 <div>
022  @author J. Alex Taylor, hairyfatguy.com
023 </div>
024 */
025//------------------------------------------------------------------------------
026// com.hfg Library
027//
028// This library is free software; you can redistribute it and/or
029// modify it under the terms of the GNU Lesser General Public
030// License as published by the Free Software Foundation; either
031// version 2.1 of the License, or (at your option) any later version.
032//
033// This library is distributed in the hope that it will be useful,
034// but WITHOUT ANY WARRANTY; without even the implied warranty of
035// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
036// Lesser General Public License for more details.
037//
038// You should have received a copy of the GNU Lesser General Public
039// License along with this library; if not, write to the Free Software
040// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
041//
042// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
043// jataylor@hairyfatguy.com
044//------------------------------------------------------------------------------
045
046public class MultipleSequenceAlignment<T extends BioSequence> implements Cloneable
047{
048   //**************************************************************************
049   // PRIVATE FIELDS
050   //**************************************************************************
051
052   private String          mId;
053   private String          mTitle;
054   private List<T>         mAlignedSeqs = new ArrayList<>();
055   private int             mAlignmentLength;
056   private BioSequenceType mBioSequenceType;
057   private PositionalFrequencyMatrix<T> mPositionFreqMatrix;
058   private DataTable       mDataTable;
059   private Map<String, Object> mAttributes;
060
061   //**************************************************************************
062   // CONSTRUCTORS
063   //**************************************************************************
064
065   //---------------------------------------------------------------------------
066   public MultipleSequenceAlignment()
067   {
068
069   }
070
071   //---------------------------------------------------------------------------
072   public MultipleSequenceAlignment(Collection<T> inAlignedSeqs)
073   {
074       if (inAlignedSeqs != null)
075       {
076          for (T seq : inAlignedSeqs)
077          {
078             addSequence(seq);
079          }
080       }
081   }
082
083   //**************************************************************************
084   // PUBLIC METHODS
085   //**************************************************************************
086
087   //---------------------------------------------------------------------------
088   public MultipleSequenceAlignment<T> setId(String inValue)
089   {
090      mId = inValue;
091
092      return this;
093   }
094
095   //---------------------------------------------------------------------------
096   public String getId()
097   {
098      return mId;
099   }
100
101
102   //---------------------------------------------------------------------------
103   public MultipleSequenceAlignment<T> setTitle(String inValue)
104   {
105      mTitle = inValue;
106
107      return this;
108   }
109
110   //---------------------------------------------------------------------------
111   public String getTitle()
112   {
113      return mTitle;
114   }
115
116
117   //---------------------------------------------------------------------------
118   public MultipleSequenceAlignment<T> setDataTable(DataTable inValue)
119   {
120      mDataTable = inValue;
121
122      return this;
123   }
124
125   //---------------------------------------------------------------------------
126   public DataTable getDataTable()
127   {
128      return mDataTable;
129   }
130
131
132   //---------------------------------------------------------------------------
133   @Override
134   public MultipleSequenceAlignment<T> clone()
135   {
136      MultipleSequenceAlignment<T> cloneObj;
137      try
138      {
139         cloneObj = (MultipleSequenceAlignment<T>) super.clone();
140      }
141      catch (CloneNotSupportedException e)
142      {
143         throw new ProgrammingException(e);
144      }
145
146      if (mAlignedSeqs != null)
147      {
148         cloneObj.mAlignedSeqs = new ArrayList<>(mAlignedSeqs.size());
149         for (T seq : mAlignedSeqs)
150         {
151            cloneObj.mAlignedSeqs.add((T) seq.clone());
152         }
153      }
154
155      if (mPositionFreqMatrix != null)
156      {
157         cloneObj.mPositionFreqMatrix = mPositionFreqMatrix.clone();
158      }
159
160      return cloneObj;
161   }
162
163   //---------------------------------------------------------------------------
164   public void clearCachedData()
165   {
166      mPositionFreqMatrix = null;
167   }
168
169   //---------------------------------------------------------------------------
170   public MultipleSequenceAlignment<T> subset(Collection<String> inSeqIds)
171   {
172      MultipleSequenceAlignment<T> subset = clone();
173
174      subset.mAlignedSeqs.clear();
175      for (String seqId : inSeqIds)
176      {
177         subset.mAlignedSeqs.add(getSequence(seqId));
178      }
179
180      // Force recalculation of the position freq. data
181      subset.mPositionFreqMatrix = null;
182
183      return subset;
184   }
185
186   //---------------------------------------------------------------------------
187   public void addSequence(T inSeq)
188   {
189      if (inSeq != null)
190      {
191         if (inSeq.length() != mAlignmentLength
192               && CollectionUtil.hasValues(mAlignedSeqs))
193         {
194            throw new RuntimeException(inSeq.getID() + "'s length of "
195                                       + inSeq.length() + " is different from the alignment length ("
196                                       + mAlignmentLength + ")! They must be the same.");
197         }
198
199         mAlignedSeqs.add(inSeq);
200         if (0 == mAlignmentLength) mAlignmentLength = inSeq.length();
201      }
202
203      if (mPositionFreqMatrix != null)
204      {
205         mPositionFreqMatrix.addSequence(inSeq);
206      }
207   }
208
209   //---------------------------------------------------------------------------
210   public List<T> getSequences()
211   {
212      return mAlignedSeqs;
213   }
214
215   //---------------------------------------------------------------------------
216   public T getSequence(String inSeqId)
217   {
218      T requestedSeq = null;
219      for (T seq : getSequences())
220      {
221         if (seq.getID().equals(inSeqId))
222         {
223            requestedSeq = seq;
224            break;
225         }
226      }
227
228      return requestedSeq;
229   }
230
231   //---------------------------------------------------------------------------
232   public void removeSequence(T inSeq)
233   {
234      getSequences().remove(inSeq);
235      clearCachedData();
236   }
237
238   //---------------------------------------------------------------------------
239   public void addInsert(int inIndex)
240   {
241      for (T seq : getSequences())
242      {
243         StringBuilder buffer = new StringBuilder(seq.getSequence());
244         buffer.insert(inIndex, "-");
245         seq.setSequence(buffer);
246      }
247
248      mAlignmentLength++;
249      mPositionFreqMatrix = null;
250   }
251
252   //---------------------------------------------------------------------------
253   public int size()
254   {
255      return (getSequences() != null ? getSequences().size() : 0);
256   }
257
258   //---------------------------------------------------------------------------
259   public BioSequenceType getBioSequenceType()
260   {
261      if (null == mBioSequenceType
262          && CollectionUtil.hasValues(mAlignedSeqs))
263      {
264         mBioSequenceType = mAlignedSeqs.get(0).getType();
265      }
266
267      return mBioSequenceType;
268   }
269
270   //---------------------------------------------------------------------------
271   public int length()
272   {
273      return mAlignmentLength;
274   }
275
276   //---------------------------------------------------------------------------
277   public PositionalFrequencyMatrix<T> getPositionFreqMatrix()
278   {
279      return getPositionFreqMatrix(null);
280   }
281
282   //---------------------------------------------------------------------------
283   public PositionalFrequencyMatrix<T> getPositionFreqMatrix(PositionalFrequencyMatrix.Flag[] inFlags)
284   {
285      if (mPositionFreqMatrix != null)
286      {
287         Set<PositionalFrequencyMatrix.Flag> currentFlags = mPositionFreqMatrix.getFlags();
288
289         boolean flagsDiffer = true;
290         if (null == inFlags)
291         {
292            if (! CollectionUtil.hasValues(currentFlags))
293            {
294               flagsDiffer = false;
295            }
296         }
297         else if (CollectionUtil.hasValues(currentFlags)
298                  && inFlags.length == currentFlags.size())
299         {
300            for (PositionalFrequencyMatrix.Flag flag : inFlags)
301            {
302               flagsDiffer = false;
303               if (! currentFlags.contains(flag))
304               {
305                  flagsDiffer = true;
306                  break;
307               }
308            }
309         }
310
311         if (flagsDiffer)
312         {
313            mPositionFreqMatrix = new PositionalFrequencyMatrix<T>(this, inFlags);
314         }
315      }
316      else
317      {
318         if (CollectionUtil.hasValues(getSequences()))
319         {
320            mPositionFreqMatrix = new PositionalFrequencyMatrix<T>(this, inFlags);
321         }
322      }
323
324      return mPositionFreqMatrix;
325   }
326/*
327   //---------------------------------------------------------------------------
328   public SparseMatrix<Character, Integer, Float> getPositionProbabilityMatrix()
329   {
330      SparseMatrix<Character, Integer, Float> probabilityMatrix = null;
331      int size = size();
332
333      PositionalFrequencyMatrix<T> freqMatrix = getPositionFreqMatrix();
334      if (freqMatrix != null)
335      {
336         probabilityMatrix = new SparseMatrix<Character, Integer, Float>();
337         for (Character residue : freqMatrix.getResidueKeys())
338         {
339            for (Integer position : freqMatrix.getPositionKeys())
340            {
341               probabilityMatrix.put(residue, position, freqMatrix.getCount(residue, position) / (float) size);
342            }
343         }
344      }
345
346      return probabilityMatrix;
347   }
348*/
349   //---------------------------------------------------------------------------
350   /**
351    Returns a percent identity matrix adjusted for any terminal gaps.
352    @return the generated percent identity matrix
353    */
354   public SparseMatrix<String, String, Float> getPctIdentityMatrix()
355   {
356      int matrixWidth = mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10;
357
358      SparseMatrix<String, String, Float> matrix = new SparseMatrix<>(matrixWidth, matrixWidth);
359
360      if (mAlignedSeqs != null)
361      {
362         // Calculate the pct. id between ea. pair of sequences.
363         // (The A-B pct. id is the not necessarily the same as the B-A pct. id.)
364         for (int i = 0; i < mAlignedSeqs.size(); i++)
365         {
366            BioSequence seq1 = mAlignedSeqs.get(i);
367
368            matrix.put(seq1.getID(), seq1.getID(), 100f); // Identity diagonal
369
370            for (int j = i + 1; j < mAlignedSeqs.size(); j++)
371            {
372               BioSequence seq2 = mAlignedSeqs.get(j);
373
374               AlignedQuery query = new AlignedQuery(seq1, seq1.getSequence(), 1);
375               AlignedSubject subject = new AlignedSubject(seq2, seq2.getSequence(), 1);
376               PairwiseSeqAlignment pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject);
377               matrix.put(seq1.getID(), seq2.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity());
378
379               // Invert
380               query = new AlignedQuery(seq2, seq2.getSequence(), 1);
381               subject = new AlignedSubject(seq1, seq1.getSequence(), 1);
382               pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject);
383               matrix.put(seq2.getID(), seq1.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity());
384            }
385         }
386      }
387
388      return matrix;
389   }
390   //---------------------------------------------------------------------------
391   /**
392    Returns a distance matrix using the specified model. For a simple distance matrix
393    based on mismatches and without any evolutionary compensation, use the UncorrectedModel.
394    @param inAlgorithm the distance matrix mode to use when calculating the distance matrix
395    @return the generated DistanceMatrix
396    */
397   public DistanceMatrix getDistanceMatrix(DistanceMatrixModel inAlgorithm)
398   {
399      if (null == inAlgorithm)
400      {
401         throw new RuntimeException("A DistanceMatrixAlgorithm must be specified!");
402      }
403
404      DistanceMatrix matrix = new DistanceMatrix(mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10);
405
406      if (mAlignedSeqs != null)
407      {
408         // Calculate the distance between ea. pair of sequences.
409         // (The A-B distance is the same as the B-A distance.)
410         for (int i = 0; i < mAlignedSeqs.size() - 1; i++)
411         {
412            BioSequence seq1 = mAlignedSeqs.get(i);
413            for (int j = i + 1; j < mAlignedSeqs.size(); j++)
414            {
415               BioSequence seq2 = mAlignedSeqs.get(j);
416
417               matrix.setDistance(seq1.getID(), seq2.getID(), inAlgorithm.calculateDistance(seq1, seq2));
418            }
419         }
420      }
421
422      return matrix;
423   }
424
425
426   //---------------------------------------------------------------------------
427   public void orderByDistanceTo(String inSeqID)
428   {
429      if (null == getSequence(inSeqID))
430      {
431         throw new RuntimeException("The MSA does not contain a sequence with id " + StringUtil.singleQuote(inSeqID) + "!");
432      }
433
434      DistanceMatrix matrix = getDistanceMatrix(new UncorrectedModel());
435
436      List<Edge<String>> sortedEdges = matrix.getSortedEdges(inSeqID);
437
438      List<T> resortedAlignedSeqs = new ArrayList<T>(getSequences().size());
439      for (Edge<String> edge : sortedEdges)
440      {
441         resortedAlignedSeqs.add(getSequence(edge.getTo()));
442      }
443
444      mAlignedSeqs = resortedAlignedSeqs;
445   }
446
447   //---------------------------------------------------------------------------
448   public List<Character> getPositionResidues(int inPosition)
449   {
450      List<Character> positionResidues = new ArrayList<>(size());
451      for (T seq : getSequences())
452      {
453         positionResidues.add(seq.residueAt(inPosition));
454      }
455
456      return positionResidues;
457   }
458
459   //---------------------------------------------------------------------------
460   public Set<Character> getPositionResidueSet(int inPosition)
461   {
462      Set<Character> positionResidues = new HashSet<>(20);
463      for (T seq : getSequences())
464      {
465         positionResidues.add(seq.residueAt(inPosition));
466      }
467
468      return positionResidues;
469   }
470
471
472   //--------------------------------------------------------------------------
473   public void setAttribute(String inName, Object inValue)
474   {
475      if (null == mAttributes)
476      {
477         mAttributes = new HashMap<>();
478      }
479
480      mAttributes.put(inName, inValue);
481   }
482
483   //--------------------------------------------------------------------------
484   public boolean hasAttribute(String inName)
485   {
486      return mAttributes != null && mAttributes.containsKey(inName);
487   }
488
489   //--------------------------------------------------------------------------
490   public Object getAttribute(String inName)
491   {
492      Object attr = null;
493      if (mAttributes != null)
494      {
495         attr = mAttributes.get(inName);
496      }
497
498      return attr;
499   }
500
501   //--------------------------------------------------------------------------
502   public Collection<String> getAttributeNames()
503   {
504      Collection<String> attrNames = null;
505      if (mAttributes != null)
506      {
507         attrNames = mAttributes.keySet();
508      }
509
510      return attrNames;
511   }
512
513   //--------------------------------------------------------------------------
514   public void clearAttributes()
515   {
516      if (mAttributes != null)
517      {
518         mAttributes.clear();
519      }
520   }
521
522   //--------------------------------------------------------------------------
523   public Object removeAttribute(String inName)
524   {
525      Object attr = null;
526      if (mAttributes != null)
527      {
528         attr  = mAttributes.remove(inName);
529      }
530
531      return attr;
532   }
533
534   //**************************************************************************
535   // PROTECTED METHODS
536   //**************************************************************************
537
538   //---------------------------------------------------------------------------
539   protected void setLength(int inValue)
540   {
541      mAlignmentLength = inValue;
542   }
543}