package com.hfg.bio.seq.alignment;

import java.util.ArrayList;
import java.util.List;

import com.hfg.bio.phylogeny.DistanceMatrix;
import com.hfg.bio.seq.BioSequence;

//------------------------------------------------------------------------------
/**
 * A basic similarity assessment that uses k-mer analysis.
 *
 * @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------

public class KMerSimilarity
{

   //---------------------------------------------------------------------------
   /**
    * Builds a distance matrix from the k-mer content shared between each pair
    * of the specified sequences.
    * <p>
    * For each pair, the similarity is the number of k-mers the two sequences
    * have in common divided by the number of k-mer positions in the shorter
    * sequence (<code>min(length1, length2) - inKMerSize + 1</code>). The value
    * stored in the matrix is <code>1 - similarity</code>, keyed by the IDs of
    * the two sequences.
    * </p>
    * <p>
    * If the shorter sequence of a pair is too short to contain a single k-mer,
    * the pair is given a similarity of 0 (a distance of 1) rather than the
    * NaN / infinite value that the division would otherwise produce.
    * </p>
    *
    * @param inSequences the sequences to compare. Each distinct pair is evaluated once.
    * @param inKMerSize  the k-mer length to use for the comparison
    * @return a DistanceMatrix holding one distance per distinct pair of sequences
    */
   public DistanceMatrix generateKMerDistanceMatrix(List<? extends BioSequence> inSequences, int inKMerSize)
   {
      // Index the k-mer content of every sequence once, up front
      List<KMerBitSet> kMerData = new ArrayList<>(inSequences.size());
      for (BioSequence sequence : inSequences)
      {
         KMerBitSet kMerBitSet = new KMerBitSet(sequence.getType(), inKMerSize);
         kMerBitSet.fill(sequence);

         kMerData.add(kMerBitSet);
      }

      // Calculate the k-mer "distance" between ea. sequence pair
      DistanceMatrix distanceMatrix = new DistanceMatrix();

      int numSequences = kMerData.size();

      for (int i = 0; i < numSequences - 1; i++)
      {
         BioSequence seq1 = inSequences.get(i);
         KMerBitSet seq1KMerData = kMerData.get(i);

         // Start at i + 1 so that each unordered pair is evaluated exactly once
         // and no sequence is compared against itself.
         // NOTE(review): this assumes DistanceMatrix.setDistance() records a
         // symmetric distance for the pair - confirm against DistanceMatrix.
         for (int j = i + 1; j < numSequences; j++)
         {
            BioSequence seq2 = inSequences.get(j);
            KMerBitSet seq2KMerData = kMerData.get(j);

            float sum = seq1KMerData.getCommonKMerCount(seq2KMerData);

            // Number of k-mer positions in the shorter of the two sequences
            int maxPossibleKMers = Math.min(seq1.length(), seq2.length()) - inKMerSize + 1;

            // Guard against a zero or negative denominator when a sequence
            // is shorter than the k-mer size.
            float similarity = (maxPossibleKMers > 0 ? sum / maxPossibleKMers : 0f);

            distanceMatrix.setDistance(seq1.getID(), seq2.getID(), 1 - similarity);
         }
      }


      return distanceMatrix;
   }
}