001package com.hfg.bio.seq.alignment;
002
003import java.util.BitSet;
004
005import com.hfg.bio.seq.BioSequence;
006import com.hfg.bio.seq.BioSequenceType;
007
008//------------------------------------------------------------------------------
009/**
010 * BitSet container for holding detected k-mer instances from a sequence.
011 *
012 * @author J. Alex Taylor, hairyfatguy.com
013 */
014//------------------------------------------------------------------------------
015// com.hfg XML/HTML Coding Library
016//
017// This library is free software; you can redistribute it and/or
018// modify it under the terms of the GNU Lesser General Public
019// License as published by the Free Software Foundation; either
020// version 2.1 of the License, or (at your option) any later version.
021//
022// This library is distributed in the hope that it will be useful,
023// but WITHOUT ANY WARRANTY; without even the implied warranty of
024// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
025// Lesser General Public License for more details.
026//
027// You should have received a copy of the GNU Lesser General Public
028// License along with this library; if not, write to the Free Software
029// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
030//
031// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
032// jataylor@hairyfatguy.com
033//------------------------------------------------------------------------------
034
035public class KMerBitSet extends BitSet
036{
037   private BioSequenceType mSeqType;
038   private int mKMerSize;
039   private int[] mResidueIndexLookup;
040   private int mUnknownResidueIndex;
041
042   private static String sProteinAlphabet = "ACDEFGHIKLMNPQRSTVWXY";
043   private static String sNucleicAcidAlphabet = "ACGNT";
044
045   //--------------------------------------------------------------------------
046   public KMerBitSet(BioSequenceType inSeqType, int inKMerSize)
047   {
048      super((int) Math.pow(inSeqType.equals(BioSequenceType.PROTEIN) ? 21 : 5, inKMerSize));
049      mSeqType = inSeqType;
050      mKMerSize = inKMerSize;
051
052      String alphabet = getAlphabet();
053      mResidueIndexLookup = new int[(int) alphabet.charAt(alphabet.length() - 1) + 1];
054      for (int i = 0; i < mResidueIndexLookup.length; i++)
055      {
056         mResidueIndexLookup[i] = -1;
057      }
058
059      for (int i = 0; i < alphabet.length(); i++)
060      {
061         char theChar = alphabet.charAt(i);
062         mResidueIndexLookup[(int) theChar] = i;
063      }
064
065      mUnknownResidueIndex = mResidueIndexLookup[(int) getUnknownResidueChar()];
066   }
067
068   //--------------------------------------------------------------------------
069   public BioSequenceType getBioSequenceType()
070   {
071      return mSeqType;
072   }
073
074   //--------------------------------------------------------------------------
075   public void fill(BioSequence inSequence)
076   {
077      byte[] seqIndices = getResidueIndicesForSequence(inSequence.getSequence());
078
079      int alphabetSize = getAlphabet().length();
080
081      for (int i = 0; i <= seqIndices.length - mKMerSize; i++)
082      {
083         int charPosition = 0;
084         int bitIndex = 0;
085         int pow = mKMerSize - 1;
086         while (charPosition < mKMerSize)
087         {
088            bitIndex += seqIndices[i + charPosition++] * Math.pow(alphabetSize, pow--);
089         }
090         set(bitIndex);
091      }
092   }
093
094   //---------------------------------------------------------------------------
095   public int getCommonKMerCount(KMerBitSet inComparisonKMerBitSet)
096   {
097      int count = 0;
098      for (int i = 0; i < size(); i++)
099      {
100         if (get(i) && inComparisonKMerBitSet.get(i))
101         {
102            count++;
103         }
104      }
105
106      return count;
107   }
108
109   //---------------------------------------------------------------------------
110   private String getAlphabet()
111   {
112      return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? sProteinAlphabet : sNucleicAcidAlphabet);
113   }
114
115   //---------------------------------------------------------------------------
116   private char getUnknownResidueChar()
117   {
118      return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? 'X' : 'N');
119   }
120
121   //---------------------------------------------------------------------------
122   private byte[] getResidueIndicesForSequence(String inSequence)
123   {
124      int length = inSequence.length();
125      byte[] indices = new byte[length];
126
127      String ucSequence = inSequence.toUpperCase();
128      for (int i = 0; i < length; i++)
129      {
130         char theChar = ucSequence.charAt(i);
131
132         int residueIndex = mResidueIndexLookup[(int) theChar];
133         if (-1 == residueIndex)
134         {
135            residueIndex = mUnknownResidueIndex;
136         }
137
138         indices[i] = (byte) residueIndex;
139      }
140
141      return indices;
142   }
143
144}