001package com.hfg.bio.seq.alignment; 002 003import java.util.*; 004 005import com.hfg.bio.phylogeny.DistanceMatrix; 006import com.hfg.bio.phylogeny.DistanceMatrixModel; 007import com.hfg.bio.phylogeny.UncorrectedModel; 008import com.hfg.bio.seq.PositionalFrequencyMatrix; 009import com.hfg.bio.seq.BioSequence; 010import com.hfg.bio.seq.BioSequenceType; 011import com.hfg.exception.ProgrammingException; 012import com.hfg.network.Edge; 013import com.hfg.util.StringUtil; 014import com.hfg.util.collection.CollectionUtil; 015import com.hfg.util.collection.DataTable; 016import com.hfg.util.collection.SparseMatrix; 017 018//------------------------------------------------------------------------------ 019/** 020 Container for aligned sequences. 021 <div> 022 @author J. Alex Taylor, hairyfatguy.com 023 </div> 024 */ 025//------------------------------------------------------------------------------ 026// com.hfg Library 027// 028// This library is free software; you can redistribute it and/or 029// modify it under the terms of the GNU Lesser General Public 030// License as published by the Free Software Foundation; either 031// version 2.1 of the License, or (at your option) any later version. 032// 033// This library is distributed in the hope that it will be useful, 034// but WITHOUT ANY WARRANTY; without even the implied warranty of 035// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 036// Lesser General Public License for more details. 037// 038// You should have received a copy of the GNU Lesser General Public 039// License along with this library; if not, write to the Free Software 040// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 041// 042// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 043// jataylor@hairyfatguy.com 044//------------------------------------------------------------------------------ 045 046public class MultipleSequenceAlignment<T extends BioSequence> implements Cloneable 047{ 048 //************************************************************************** 049 // PRIVATE FIELDS 050 //************************************************************************** 051 052 private String mId; 053 private String mTitle; 054 private List<T> mAlignedSeqs = new ArrayList<>(); 055 private int mAlignmentLength; 056 private BioSequenceType mBioSequenceType; 057 private PositionalFrequencyMatrix<T> mPositionFreqMatrix; 058 private DataTable mDataTable; 059 private Map<String, Object> mAttributes; 060 061 //************************************************************************** 062 // CONSTRUCTORS 063 //************************************************************************** 064 065 //--------------------------------------------------------------------------- 066 public MultipleSequenceAlignment() 067 { 068 069 } 070 071 //--------------------------------------------------------------------------- 072 public MultipleSequenceAlignment(Collection<T> inAlignedSeqs) 073 { 074 if (inAlignedSeqs != null) 075 { 076 for (T seq : inAlignedSeqs) 077 { 078 addSequence(seq); 079 } 080 } 081 } 082 083 //************************************************************************** 084 // PUBLIC METHODS 085 //************************************************************************** 086 087 //--------------------------------------------------------------------------- 088 public MultipleSequenceAlignment<T> setId(String inValue) 089 { 090 mId = inValue; 091 092 return this; 093 } 094 095 //--------------------------------------------------------------------------- 096 public String getId() 097 { 098 return mId; 099 } 100 101 102 //--------------------------------------------------------------------------- 103 public MultipleSequenceAlignment<T> setTitle(String inValue) 104 { 105 mTitle = inValue; 106 107 return this; 108 } 109 110 //--------------------------------------------------------------------------- 111 public String getTitle() 112 { 113 return mTitle; 114 } 115 116 117 //--------------------------------------------------------------------------- 118 public MultipleSequenceAlignment<T> setDataTable(DataTable inValue) 119 { 120 mDataTable = inValue; 121 122 return this; 123 } 124 125 //--------------------------------------------------------------------------- 126 public DataTable getDataTable() 127 { 128 return mDataTable; 129 } 130 131 132 //--------------------------------------------------------------------------- 133 @Override 134 public MultipleSequenceAlignment<T> clone() 135 { 136 MultipleSequenceAlignment<T> cloneObj; 137 try 138 { 139 cloneObj = (MultipleSequenceAlignment<T>) super.clone(); 140 } 141 catch (CloneNotSupportedException e) 142 { 143 throw new ProgrammingException(e); 144 } 145 146 if (mAlignedSeqs != null) 147 { 148 cloneObj.mAlignedSeqs = new ArrayList<>(mAlignedSeqs.size()); 149 for (T seq : mAlignedSeqs) 150 { 151 cloneObj.mAlignedSeqs.add((T) seq.clone()); 152 } 153 } 154 155 if (mPositionFreqMatrix != null) 156 { 157 cloneObj.mPositionFreqMatrix = mPositionFreqMatrix.clone(); 158 } 159 160 return cloneObj; 161 } 162 163 //--------------------------------------------------------------------------- 164 public void clearCachedData() 165 { 166 mPositionFreqMatrix = null; 167 } 168 169 //--------------------------------------------------------------------------- 170 public MultipleSequenceAlignment<T> subset(Collection<String> inSeqIds) 171 { 172 MultipleSequenceAlignment<T> subset = clone(); 173 174 subset.mAlignedSeqs.clear(); 175 for (String seqId : inSeqIds) 176 { 177 subset.mAlignedSeqs.add(getSequence(seqId)); 178 } 179 180 // Force recalculation of the position freq. data 181 subset.mPositionFreqMatrix = null; 182 183 return subset; 184 } 185 186 //--------------------------------------------------------------------------- 187 public void addSequence(T inSeq) 188 { 189 if (inSeq != null) 190 { 191 if (inSeq.length() != mAlignmentLength 192 && CollectionUtil.hasValues(mAlignedSeqs)) 193 { 194 throw new RuntimeException(inSeq.getID() + "'s length of " 195 + inSeq.length() + " is different from the alignment length (" 196 + mAlignmentLength + ")! They must be the same."); 197 } 198 199 mAlignedSeqs.add(inSeq); 200 if (0 == mAlignmentLength) mAlignmentLength = inSeq.length(); 201 } 202 203 if (mPositionFreqMatrix != null) 204 { 205 mPositionFreqMatrix.addSequence(inSeq); 206 } 207 } 208 209 //--------------------------------------------------------------------------- 210 public List<T> getSequences() 211 { 212 return mAlignedSeqs; 213 } 214 215 //--------------------------------------------------------------------------- 216 public T getSequence(String inSeqId) 217 { 218 T requestedSeq = null; 219 for (T seq : getSequences()) 220 { 221 if (seq.getID().equals(inSeqId)) 222 { 223 requestedSeq = seq; 224 break; 225 } 226 } 227 228 return requestedSeq; 229 } 230 231 //--------------------------------------------------------------------------- 232 public void removeSequence(T inSeq) 233 { 234 getSequences().remove(inSeq); 235 clearCachedData(); 236 } 237 238 //--------------------------------------------------------------------------- 239 public void addInsert(int inIndex) 240 { 241 for (T seq : getSequences()) 242 { 243 StringBuilder buffer = new StringBuilder(seq.getSequence()); 244 buffer.insert(inIndex, "-"); 245 seq.setSequence(buffer); 246 } 247 248 mAlignmentLength++; 249 mPositionFreqMatrix = null; 250 } 251 252 //--------------------------------------------------------------------------- 253 public int size() 254 { 255 return (getSequences() != null ? getSequences().size() : 0); 256 } 257 258 //--------------------------------------------------------------------------- 259 public BioSequenceType getBioSequenceType() 260 { 261 if (null == mBioSequenceType 262 && CollectionUtil.hasValues(mAlignedSeqs)) 263 { 264 mBioSequenceType = mAlignedSeqs.get(0).getType(); 265 } 266 267 return mBioSequenceType; 268 } 269 270 //--------------------------------------------------------------------------- 271 public int length() 272 { 273 return mAlignmentLength; 274 } 275 276 //--------------------------------------------------------------------------- 277 public PositionalFrequencyMatrix<T> getPositionFreqMatrix() 278 { 279 return getPositionFreqMatrix(null); 280 } 281 282 //--------------------------------------------------------------------------- 283 public PositionalFrequencyMatrix<T> getPositionFreqMatrix(PositionalFrequencyMatrix.Flag[] inFlags) 284 { 285 if (mPositionFreqMatrix != null) 286 { 287 Set<PositionalFrequencyMatrix.Flag> currentFlags = mPositionFreqMatrix.getFlags(); 288 289 boolean flagsDiffer = true; 290 if (null == inFlags) 291 { 292 if (! CollectionUtil.hasValues(currentFlags)) 293 { 294 flagsDiffer = false; 295 } 296 } 297 else if (CollectionUtil.hasValues(currentFlags) 298 && inFlags.length == currentFlags.size()) 299 { 300 for (PositionalFrequencyMatrix.Flag flag : inFlags) 301 { 302 flagsDiffer = false; 303 if (! currentFlags.contains(flag)) 304 { 305 flagsDiffer = true; 306 break; 307 } 308 } 309 } 310 311 if (flagsDiffer) 312 { 313 mPositionFreqMatrix = new PositionalFrequencyMatrix<T>(this, inFlags); 314 } 315 } 316 else 317 { 318 if (CollectionUtil.hasValues(getSequences())) 319 { 320 mPositionFreqMatrix = new PositionalFrequencyMatrix<T>(this, inFlags); 321 } 322 } 323 324 return mPositionFreqMatrix; 325 } 326/* 327 //--------------------------------------------------------------------------- 328 public SparseMatrix<Character, Integer, Float> getPositionProbabilityMatrix() 329 { 330 SparseMatrix<Character, Integer, Float> probabilityMatrix = null; 331 int size = size(); 332 333 PositionalFrequencyMatrix<T> freqMatrix = getPositionFreqMatrix(); 334 if (freqMatrix != null) 335 { 336 probabilityMatrix = new SparseMatrix<Character, Integer, Float>(); 337 for (Character residue : freqMatrix.getResidueKeys()) 338 { 339 for (Integer position : freqMatrix.getPositionKeys()) 340 { 341 probabilityMatrix.put(residue, position, freqMatrix.getCount(residue, position) / (float) size); 342 } 343 } 344 } 345 346 return probabilityMatrix; 347 } 348*/ 349 //--------------------------------------------------------------------------- 350 /** 351 Returns a percent identity matrix adjusted for any terminal gaps. 352 @return the generated percent identity matrix 353 */ 354 public SparseMatrix<String, String, Float> getPctIdentityMatrix() 355 { 356 int matrixWidth = mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10; 357 358 SparseMatrix<String, String, Float> matrix = new SparseMatrix<>(matrixWidth, matrixWidth); 359 360 if (mAlignedSeqs != null) 361 { 362 // Calculate the pct. id between ea. pair of sequences. 363 // (The A-B pct. id is the not necessarily the same as the B-A pct. id.) 364 for (int i = 0; i < mAlignedSeqs.size(); i++) 365 { 366 BioSequence seq1 = mAlignedSeqs.get(i); 367 368 matrix.put(seq1.getID(), seq1.getID(), 100f); // Identity diagonal 369 370 for (int j = i + 1; j < mAlignedSeqs.size(); j++) 371 { 372 BioSequence seq2 = mAlignedSeqs.get(j); 373 374 AlignedQuery query = new AlignedQuery(seq1, seq1.getSequence(), 1); 375 AlignedSubject subject = new AlignedSubject(seq2, seq2.getSequence(), 1); 376 PairwiseSeqAlignment pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject); 377 matrix.put(seq1.getID(), seq2.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity()); 378 379 // Invert 380 query = new AlignedQuery(seq2, seq2.getSequence(), 1); 381 subject = new AlignedSubject(seq1, seq1.getSequence(), 1); 382 pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject); 383 matrix.put(seq2.getID(), seq1.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity()); 384 } 385 } 386 } 387 388 return matrix; 389 } 390 //--------------------------------------------------------------------------- 391 /** 392 Returns a distance matrix using the specified model. For a simple distance matrix 393 based on mismatches and without any evolutionary compensation, use the UncorrectedModel. 394 @param inAlgorithm the distance matrix mode to use when calculating the distance matrix 395 @return the generated DistanceMatrix 396 */ 397 public DistanceMatrix getDistanceMatrix(DistanceMatrixModel inAlgorithm) 398 { 399 if (null == inAlgorithm) 400 { 401 throw new RuntimeException("A DistanceMatrixAlgorithm must be specified!"); 402 } 403 404 DistanceMatrix matrix = new DistanceMatrix(mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10); 405 406 if (mAlignedSeqs != null) 407 { 408 // Calculate the distance between ea. pair of sequences. 409 // (The A-B distance is the same as the B-A distance.) 410 for (int i = 0; i < mAlignedSeqs.size() - 1; i++) 411 { 412 BioSequence seq1 = mAlignedSeqs.get(i); 413 for (int j = i + 1; j < mAlignedSeqs.size(); j++) 414 { 415 BioSequence seq2 = mAlignedSeqs.get(j); 416 417 matrix.setDistance(seq1.getID(), seq2.getID(), inAlgorithm.calculateDistance(seq1, seq2)); 418 } 419 } 420 } 421 422 return matrix; 423 } 424 425 426 //--------------------------------------------------------------------------- 427 public void orderByDistanceTo(String inSeqID) 428 { 429 if (null == getSequence(inSeqID)) 430 { 431 throw new RuntimeException("The MSA does not contain a sequence with id " + StringUtil.singleQuote(inSeqID) + "!"); 432 } 433 434 DistanceMatrix matrix = getDistanceMatrix(new UncorrectedModel()); 435 436 List<Edge<String>> sortedEdges = matrix.getSortedEdges(inSeqID); 437 438 List<T> resortedAlignedSeqs = new ArrayList<T>(getSequences().size()); 439 for (Edge<String> edge : sortedEdges) 440 { 441 resortedAlignedSeqs.add(getSequence(edge.getTo())); 442 } 443 444 mAlignedSeqs = resortedAlignedSeqs; 445 } 446 447 //--------------------------------------------------------------------------- 448 public List<Character> getPositionResidues(int inPosition) 449 { 450 List<Character> positionResidues = new ArrayList<>(size()); 451 for (T seq : getSequences()) 452 { 453 positionResidues.add(seq.residueAt(inPosition)); 454 } 455 456 return positionResidues; 457 } 458 459 //--------------------------------------------------------------------------- 460 public Set<Character> getPositionResidueSet(int inPosition) 461 { 462 Set<Character> positionResidues = new HashSet<>(20); 463 for (T seq : getSequences()) 464 { 465 positionResidues.add(seq.residueAt(inPosition)); 466 } 467 468 return positionResidues; 469 } 470 471 472 //-------------------------------------------------------------------------- 473 public void setAttribute(String inName, Object inValue) 474 { 475 if (null == mAttributes) 476 { 477 mAttributes = new HashMap<>(); 478 } 479 480 mAttributes.put(inName, inValue); 481 } 482 483 //-------------------------------------------------------------------------- 484 public boolean hasAttribute(String inName) 485 { 486 return mAttributes != null && mAttributes.containsKey(inName); 487 } 488 489 //-------------------------------------------------------------------------- 490 public Object getAttribute(String inName) 491 { 492 Object attr = null; 493 if (mAttributes != null) 494 { 495 attr = mAttributes.get(inName); 496 } 497 498 return attr; 499 } 500 501 //-------------------------------------------------------------------------- 502 public Collection<String> getAttributeNames() 503 { 504 Collection<String> attrNames = null; 505 if (mAttributes != null) 506 { 507 attrNames = mAttributes.keySet(); 508 } 509 510 return attrNames; 511 } 512 513 //-------------------------------------------------------------------------- 514 public void clearAttributes() 515 { 516 if (mAttributes != null) 517 { 518 mAttributes.clear(); 519 } 520 } 521 522 //-------------------------------------------------------------------------- 523 public Object removeAttribute(String inName) 524 { 525 Object attr = null; 526 if (mAttributes != null) 527 { 528 attr = mAttributes.remove(inName); 529 } 530 531 return attr; 532 } 533 534 //************************************************************************** 535 // PROTECTED METHODS 536 //************************************************************************** 537 538 //--------------------------------------------------------------------------- 539 protected void setLength(int inValue) 540 { 541 mAlignmentLength = inValue; 542 } 543}