001package com.hfg.bio.seq.alignment;
002
003import java.util.HashMap;
004import java.util.List;
005import java.util.Map;
006
007import com.hfg.bio.phylogeny.DistanceMatrix;
008import com.hfg.bio.phylogeny.PhylogeneticTree;
009import com.hfg.bio.phylogeny.PhyloNode;
010import com.hfg.bio.phylogeny.UPGMA;
011import com.hfg.bio.seq.BioSequence;
012import com.hfg.bio.seq.Protein;
013import com.hfg.bio.seq.alignment.matrix.PSSM;
014import com.hfg.network.Edge;
015import com.hfg.util.collection.CollectionUtil;
016
017//------------------------------------------------------------------------------
018/**
019 * A basic multiple sequence aligner that uses an initial 3-mer analysis to produce
020 * a draft guide tree and then performs progressive pairwise alignments.
021 *
022 * @author J. Alex Taylor, hairyfatguy.com
023 */
024//------------------------------------------------------------------------------
025// com.hfg XML/HTML Coding Library
026//
027// This library is free software; you can redistribute it and/or
028// modify it under the terms of the GNU Lesser General Public
029// License as published by the Free Software Foundation; either
030// version 2.1 of the License, or (at your option) any later version.
031//
032// This library is distributed in the hope that it will be useful,
033// but WITHOUT ANY WARRANTY; without even the implied warranty of
034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035// Lesser General Public License for more details.
036//
037// You should have received a copy of the GNU Lesser General Public
038// License along with this library; if not, write to the Free Software
039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
040//
041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
042// jataylor@hairyfatguy.com
043//------------------------------------------------------------------------------
044
045public class MultipleSequenceAligner<T extends BioSequence>
046{
047   private MSA_Settings mSettings;
048
049   //###########################################################################
050   // CONSTRUCTORS
051   //###########################################################################
052
053   //---------------------------------------------------------------------------
054   public MultipleSequenceAligner()
055   {
056      mSettings = generateDefaultSettings();
057   }
058
059   //---------------------------------------------------------------------------
060   public MultipleSequenceAligner(MSA_Settings inSettings)
061   {
062      setSettings(inSettings != null ? inSettings : generateDefaultSettings());
063   }
064
065   //###########################################################################
066   // PUBLIC METHODS
067   //###########################################################################
068
069   //---------------------------------------------------------------------------
070   public MSA_Settings getSettings()
071   {
072      return mSettings;
073   }
074
075   //---------------------------------------------------------------------------
076   public MultipleSequenceAligner<T> setSettings(MSA_Settings inValue)
077   {
078      mSettings = inValue;
079      return this;
080   }
081
082
083   //---------------------------------------------------------------------------
084   public MultipleSequenceAlignment<T> align(List<Protein> inSequences)
085
086   {
087      MultipleSequenceAlignment<T> msa = new MultipleSequenceAlignment<>();
088
089      Map<String, Protein> seqLookupMap = new HashMap<>(inSequences.size());
090      for (Protein sequence : inSequences)
091      {
092         seqLookupMap.put(sequence.getID(), sequence);
093      }
094
095      // Perform a k-mer analysis to produce an estimated distance matrix
096      DistanceMatrix distanceMatrix = new KMerSimilarity().generateKMerDistanceMatrix(inSequences, getSettings().getKMerSize());
097
098      // Create a draft tree from the distance matrix
099      PhylogeneticTree draftTree = new UPGMA().constructTree(distanceMatrix);
100
101
102      PairwiseSeqAligner aligner = new PairwiseSeqAligner();
103      aligner.getSettings()
104            .setAlignmentType(PairwiseAlignmentType.GLOBAL)
105            .setSubstitutionMatrix(getSettings().getSubstitutionMatrix(inSequences.get(0).getType()));
106
107      for (PairwiseSeqType type : PairwiseSeqType.values())
108      {
109         for (PairwiseSeqTerminus terminus : PairwiseSeqTerminus.values())
110         {
111            aligner.getSettings()
112                  .setPenalizeTerminalGaps(type, terminus, getSettings().getPenalizeTerminalGaps(type, terminus));
113         }
114
115         aligner.getSettings()
116               .setGapPenalties(type, getSettings().getGapPenalties(type));
117      }
118
119      // Align the 2 most closely related sequences first
120      Edge<PhyloNode> closestLeafPair = draftTree.getClosestLeafPair();
121      List<PairwiseSeqAlignment> alignments = aligner.align(seqLookupMap.get(closestLeafPair.getFrom().getLabel()),
122                                                            seqLookupMap.get(closestLeafPair.getTo().getLabel()));
123      if (CollectionUtil.hasValues(alignments))
124      {
125         msa.addSequence((T) seqLookupMap.get(alignments.get(0).getAlignedQuery().getSeq().getID()).clone()
126               .setSequence(alignments.get(0).getAlignedQuery().getAlignedSeq()));
127
128         msa.addSequence((T) seqLookupMap.get(alignments.get(0).getAlignedSubject().getSeq().getID()).clone()
129               .setSequence(alignments.get(0).getAlignedSubject().getAlignedSeq()));
130
131         draftTree.removeNode(closestLeafPair.getFrom());
132
133         List<PhyloNode> leaves = draftTree.getLeafNodes();
134
135         PhyloNode lastLeaf = closestLeafPair.getTo();
136
137         // Continue retrieving the next closest sequence and adding it to the msa
138         while (leaves.size() > 1)
139         {
140            PhyloNode nextLeaf = draftTree.getClosestLeaf(lastLeaf);
141
142            draftTree.removeNode(lastLeaf);
143            leaves.remove(lastLeaf);
144
145            PSSM pssm = new PSSM(msa);
146
147            alignments = aligner.align(seqLookupMap.get(nextLeaf.getLabel()), pssm);
148            if (CollectionUtil.hasValues(alignments))
149            {
150               PairwiseSeqAlignment alignment = alignments.get(0);
151               // Any inserts for the existing sequences?
152               String alignedMSA = alignment.getAlignedSubject().getAlignedSeq();
153               if (alignedMSA.contains("-"))
154               {
155                  for (int i = 0; i < alignedMSA.length(); i++)
156                  {
157                     char residue = alignedMSA.charAt(i);
158                     if (residue == '-')
159                     {
160                        msa.addInsert(i);
161                     }
162                  }
163               }
164
165               msa.addSequence((T) seqLookupMap.get(alignment.getAlignedQuery().getSeq().getID()).clone()
166                            .setSequence(alignment.getAlignedQuery().getAlignedSeq()));
167            }
168
169            lastLeaf = nextLeaf;
170         }
171      }
172
173      // TODO: Use the msa to build a new distance matrix and refine the alignment
174
175      return msa;
176   }
177
178   //###########################################################################
179   // PRIVATE METHODS
180   //###########################################################################
181
182   //---------------------------------------------------------------------------
183   private MSA_Settings generateDefaultSettings()
184   {
185      MSA_Settings settings = new MSA_Settings();
186
187      return settings;
188   }
189
190}