001package com.hfg.bio.seq.alignment; 002 003import java.util.HashMap; 004import java.util.List; 005import java.util.Map; 006 007import com.hfg.bio.phylogeny.DistanceMatrix; 008import com.hfg.bio.phylogeny.PhylogeneticTree; 009import com.hfg.bio.phylogeny.PhyloNode; 010import com.hfg.bio.phylogeny.UPGMA; 011import com.hfg.bio.seq.BioSequence; 012import com.hfg.bio.seq.Protein; 013import com.hfg.bio.seq.alignment.matrix.PSSM; 014import com.hfg.network.Edge; 015import com.hfg.util.collection.CollectionUtil; 016 017//------------------------------------------------------------------------------ 018/** 019 * A basic multiple sequence aligner that uses an initial 3-mer analysis to produce 020 * a draft guide tree and then performs progressive pairwise alignments. 021 * 022 * @author J. Alex Taylor, hairyfatguy.com 023 */ 024//------------------------------------------------------------------------------ 025// com.hfg XML/HTML Coding Library 026// 027// This library is free software; you can redistribute it and/or 028// modify it under the terms of the GNU Lesser General Public 029// License as published by the Free Software Foundation; either 030// version 2.1 of the License, or (at your option) any later version. 031// 032// This library is distributed in the hope that it will be useful, 033// but WITHOUT ANY WARRANTY; without even the implied warranty of 034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 035// Lesser General Public License for more details. 036// 037// You should have received a copy of the GNU Lesser General Public 038// License along with this library; if not, write to the Free Software 039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 040// 041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 042// jataylor@hairyfatguy.com 043//------------------------------------------------------------------------------ 044 045public class MultipleSequenceAligner<T extends BioSequence> 046{ 047 private MSA_Settings mSettings; 048 049 //########################################################################### 050 // CONSTRUCTORS 051 //########################################################################### 052 053 //--------------------------------------------------------------------------- 054 public MultipleSequenceAligner() 055 { 056 mSettings = generateDefaultSettings(); 057 } 058 059 //--------------------------------------------------------------------------- 060 public MultipleSequenceAligner(MSA_Settings inSettings) 061 { 062 setSettings(inSettings != null ? inSettings : generateDefaultSettings()); 063 } 064 065 //########################################################################### 066 // PUBLIC METHODS 067 //########################################################################### 068 069 //--------------------------------------------------------------------------- 070 public MSA_Settings getSettings() 071 { 072 return mSettings; 073 } 074 075 //--------------------------------------------------------------------------- 076 public MultipleSequenceAligner<T> setSettings(MSA_Settings inValue) 077 { 078 mSettings = inValue; 079 return this; 080 } 081 082 083 //--------------------------------------------------------------------------- 084 public MultipleSequenceAlignment<T> align(List<Protein> inSequences) 085 086 { 087 MultipleSequenceAlignment<T> msa = new MultipleSequenceAlignment<>(); 088 089 Map<String, Protein> seqLookupMap = new HashMap<>(inSequences.size()); 090 for (Protein sequence : inSequences) 091 { 092 seqLookupMap.put(sequence.getID(), sequence); 093 } 094 095 // Perform a k-mer analysis to produce an estimated distance matrix 096 DistanceMatrix distanceMatrix = new KMerSimilarity().generateKMerDistanceMatrix(inSequences, getSettings().getKMerSize()); 097 098 // Create a draft tree from the distance matrix 099 PhylogeneticTree draftTree = new UPGMA().constructTree(distanceMatrix); 100 101 102 PairwiseSeqAligner aligner = new PairwiseSeqAligner(); 103 aligner.getSettings() 104 .setAlignmentType(PairwiseAlignmentType.GLOBAL) 105 .setSubstitutionMatrix(getSettings().getSubstitutionMatrix(inSequences.get(0).getType())); 106 107 for (PairwiseSeqType type : PairwiseSeqType.values()) 108 { 109 for (PairwiseSeqTerminus terminus : PairwiseSeqTerminus.values()) 110 { 111 aligner.getSettings() 112 .setPenalizeTerminalGaps(type, terminus, getSettings().getPenalizeTerminalGaps(type, terminus)); 113 } 114 115 aligner.getSettings() 116 .setGapPenalties(type, getSettings().getGapPenalties(type)); 117 } 118 119 // Align the 2 most closely related sequences first 120 Edge<PhyloNode> closestLeafPair = draftTree.getClosestLeafPair(); 121 List<PairwiseSeqAlignment> alignments = aligner.align(seqLookupMap.get(closestLeafPair.getFrom().getLabel()), 122 seqLookupMap.get(closestLeafPair.getTo().getLabel())); 123 if (CollectionUtil.hasValues(alignments)) 124 { 125 msa.addSequence((T) seqLookupMap.get(alignments.get(0).getAlignedQuery().getSeq().getID()).clone() 126 .setSequence(alignments.get(0).getAlignedQuery().getAlignedSeq())); 127 128 msa.addSequence((T) seqLookupMap.get(alignments.get(0).getAlignedSubject().getSeq().getID()).clone() 129 .setSequence(alignments.get(0).getAlignedSubject().getAlignedSeq())); 130 131 draftTree.removeNode(closestLeafPair.getFrom()); 132 133 List<PhyloNode> leaves = draftTree.getLeafNodes(); 134 135 PhyloNode lastLeaf = closestLeafPair.getTo(); 136 137 // Continue retrieving the next closest sequence and adding it to the msa 138 while (leaves.size() > 1) 139 { 140 PhyloNode nextLeaf = draftTree.getClosestLeaf(lastLeaf); 141 142 draftTree.removeNode(lastLeaf); 143 leaves.remove(lastLeaf); 144 145 PSSM pssm = new PSSM(msa); 146 147 alignments = aligner.align(seqLookupMap.get(nextLeaf.getLabel()), pssm); 148 if (CollectionUtil.hasValues(alignments)) 149 { 150 PairwiseSeqAlignment alignment = alignments.get(0); 151 // Any inserts for the existing sequences? 152 String alignedMSA = alignment.getAlignedSubject().getAlignedSeq(); 153 if (alignedMSA.contains("-")) 154 { 155 for (int i = 0; i < alignedMSA.length(); i++) 156 { 157 char residue = alignedMSA.charAt(i); 158 if (residue == '-') 159 { 160 msa.addInsert(i); 161 } 162 } 163 } 164 165 msa.addSequence((T) seqLookupMap.get(alignment.getAlignedQuery().getSeq().getID()).clone() 166 .setSequence(alignment.getAlignedQuery().getAlignedSeq())); 167 } 168 169 lastLeaf = nextLeaf; 170 } 171 } 172 173 // TODO: Use the msa to build a new distance matrix and refine the alignment 174 175 return msa; 176 } 177 178 //########################################################################### 179 // PRIVATE METHODS 180 //########################################################################### 181 182 //--------------------------------------------------------------------------- 183 private MSA_Settings generateDefaultSettings() 184 { 185 MSA_Settings settings = new MSA_Settings(); 186 187 return settings; 188 } 189 190}