001package com.hfg.bio.seq.alignment; 002 003import java.util.BitSet; 004 005import com.hfg.bio.seq.BioSequence; 006import com.hfg.bio.seq.BioSequenceType; 007 008//------------------------------------------------------------------------------ 009/** 010 * BitSet container for holding detected k-mer instances from a sequence. 011 * 012 * @author J. Alex Taylor, hairyfatguy.com 013 */ 014//------------------------------------------------------------------------------ 015// com.hfg XML/HTML Coding Library 016// 017// This library is free software; you can redistribute it and/or 018// modify it under the terms of the GNU Lesser General Public 019// License as published by the Free Software Foundation; either 020// version 2.1 of the License, or (at your option) any later version. 021// 022// This library is distributed in the hope that it will be useful, 023// but WITHOUT ANY WARRANTY; without even the implied warranty of 024// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 025// Lesser General Public License for more details. 026// 027// You should have received a copy of the GNU Lesser General Public 028// License along with this library; if not, write to the Free Software 029// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 030// 031// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 032// jataylor@hairyfatguy.com 033//------------------------------------------------------------------------------ 034 035public class KMerBitSet extends BitSet 036{ 037 private BioSequenceType mSeqType; 038 private int mKMerSize; 039 private int[] mResidueIndexLookup; 040 private int mUnknownResidueIndex; 041 042 private static String sProteinAlphabet = "ACDEFGHIKLMNPQRSTVWXY"; 043 private static String sNucleicAcidAlphabet = "ACGNT"; 044 045 //-------------------------------------------------------------------------- 046 public KMerBitSet(BioSequenceType inSeqType, int inKMerSize) 047 { 048 super((int) Math.pow(inSeqType.equals(BioSequenceType.PROTEIN) ? 21 : 5, inKMerSize)); 049 mSeqType = inSeqType; 050 mKMerSize = inKMerSize; 051 052 String alphabet = getAlphabet(); 053 mResidueIndexLookup = new int[(int) alphabet.charAt(alphabet.length() - 1) + 1]; 054 for (int i = 0; i < mResidueIndexLookup.length; i++) 055 { 056 mResidueIndexLookup[i] = -1; 057 } 058 059 for (int i = 0; i < alphabet.length(); i++) 060 { 061 char theChar = alphabet.charAt(i); 062 mResidueIndexLookup[(int) theChar] = i; 063 } 064 065 mUnknownResidueIndex = mResidueIndexLookup[(int) getUnknownResidueChar()]; 066 } 067 068 //-------------------------------------------------------------------------- 069 public BioSequenceType getBioSequenceType() 070 { 071 return mSeqType; 072 } 073 074 //-------------------------------------------------------------------------- 075 public void fill(BioSequence inSequence) 076 { 077 byte[] seqIndices = getResidueIndicesForSequence(inSequence.getSequence()); 078 079 int alphabetSize = getAlphabet().length(); 080 081 for (int i = 0; i <= seqIndices.length - mKMerSize; i++) 082 { 083 int charPosition = 0; 084 int bitIndex = 0; 085 int pow = mKMerSize - 1; 086 while (charPosition < mKMerSize) 087 { 088 bitIndex += seqIndices[i + charPosition++] * Math.pow(alphabetSize, pow--); 089 } 090 set(bitIndex); 091 } 092 } 093 094 //--------------------------------------------------------------------------- 095 public int getCommonKMerCount(KMerBitSet inComparisonKMerBitSet) 096 { 097 int count = 0; 098 for (int i = 0; i < size(); i++) 099 { 100 if (get(i) && inComparisonKMerBitSet.get(i)) 101 { 102 count++; 103 } 104 } 105 106 return count; 107 } 108 109 //--------------------------------------------------------------------------- 110 private String getAlphabet() 111 { 112 return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? sProteinAlphabet : sNucleicAcidAlphabet); 113 } 114 115 //--------------------------------------------------------------------------- 116 private char getUnknownResidueChar() 117 { 118 return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? 'X' : 'N'); 119 } 120 121 //--------------------------------------------------------------------------- 122 private byte[] getResidueIndicesForSequence(String inSequence) 123 { 124 int length = inSequence.length(); 125 byte[] indices = new byte[length]; 126 127 String ucSequence = inSequence.toUpperCase(); 128 for (int i = 0; i < length; i++) 129 { 130 char theChar = ucSequence.charAt(i); 131 132 int residueIndex = mResidueIndexLookup[(int) theChar]; 133 if (-1 == residueIndex) 134 { 135 residueIndex = mUnknownResidueIndex; 136 } 137 138 indices[i] = (byte) residueIndex; 139 } 140 141 return indices; 142 } 143 144}