001package com.hfg.bio.seq.alignment;
002
003import java.util.ArrayList;
004import java.util.List;
005
006import com.hfg.bio.phylogeny.DistanceMatrix;
007import com.hfg.bio.seq.BioSequence;
008
009//------------------------------------------------------------------------------
010/**
011 * A basic similarity assessment that uses k-mer analysis.
012 *
013 * @author J. Alex Taylor, hairyfatguy.com
014 */
015//------------------------------------------------------------------------------
016// com.hfg XML/HTML Coding Library
017//
018// This library is free software; you can redistribute it and/or
019// modify it under the terms of the GNU Lesser General Public
020// License as published by the Free Software Foundation; either
021// version 2.1 of the License, or (at your option) any later version.
022//
023// This library is distributed in the hope that it will be useful,
024// but WITHOUT ANY WARRANTY; without even the implied warranty of
025// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
026// Lesser General Public License for more details.
027//
028// You should have received a copy of the GNU Lesser General Public
029// License along with this library; if not, write to the Free Software
030// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
031//
032// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
033// jataylor@hairyfatguy.com
034//------------------------------------------------------------------------------
035
036public class KMerSimilarity
037{
038
039   //---------------------------------------------------------------------------
040   public DistanceMatrix generateKMerDistanceMatrix(List<? extends BioSequence> inSequences, int inKMerSize)
041   {
042      List<KMerBitSet> kMerData = new ArrayList<>(inSequences.size());
043      for (BioSequence sequence : inSequences)
044      {
045         KMerBitSet kMerBitSet = new KMerBitSet(sequence.getType(), inKMerSize);
046         kMerBitSet.fill(sequence);
047
048         kMerData.add(kMerBitSet);
049      }
050
051      // Calculate the k-mer "distance" between ea. sequence pair
052      DistanceMatrix distanceMatrix = new DistanceMatrix();
053
054      for (int i = 0; i < kMerData.size() - 1; i++)
055      {
056         BioSequence seq1 = inSequences.get(i);
057         KMerBitSet seq1KMerData = kMerData.get(i);
058         for (int j = 0; j < kMerData.size(); j++)
059         {
060            BioSequence seq2 = inSequences.get(j);
061            KMerBitSet seq2KMerData = kMerData.get(j);
062
063            float sum = seq1KMerData.getCommonKMerCount(seq2KMerData);
064
065            float similarity = sum / (Math.min(seq1.length(), seq2.length()) - inKMerSize + 1);
066
067            distanceMatrix.setDistance(seq1.getID(), seq2.getID(), 1 - similarity);
068         }
069      }
070
071
072      return distanceMatrix;
073   }
074}