001package com.hfg.bio.seq;
002
003
004import com.hfg.util.StringUtil;
005import com.hfg.util.collection.OrderedMap;
006import com.hfg.util.io.StreamUtil;
007
008import java.io.ByteArrayInputStream;
009import java.io.IOException;
010import java.util.Collection;
011import java.util.Map;
012import java.util.regex.Pattern;
013
014//------------------------------------------------------------------------------
015/**
016 Utility class of generic sequence functions.
017 <div>
018  @author J. Alex Taylor, hairyfatguy.com
019 </div>
020 */
021//------------------------------------------------------------------------------
022// com.hfg Library
023//
024// This library is free software; you can redistribute it and/or
025// modify it under the terms of the GNU Lesser General Public
026// License as published by the Free Software Foundation; either
027// version 2.1 of the License, or (at your option) any later version.
028//
029// This library is distributed in the hope that it will be useful,
030// but WITHOUT ANY WARRANTY; without even the implied warranty of
031// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
032// Lesser General Public License for more details.
033//
034// You should have received a copy of the GNU Lesser General Public
035// License along with this library; if not, write to the Free Software
036// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
037//
038// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
039// jataylor@hairyfatguy.com
040//------------------------------------------------------------------------------
041
042public class SeqUtil
043{
044   private static final Pattern NON_NUC_PATTERN = Pattern.compile("[^ATUGCN]", Pattern.CASE_INSENSITIVE);
045
046   //--------------------------------------------------------------------------
047   /**
048    Retains only a single copy of each distinct sequence.
049    @param inSequences the collection of (potentially redundant) input sequences
050    @param <T> BioSequence-implementing class
051    @return the non-redundant collection of sequences
052    */
053   public static <T extends BioSequence> Collection<T> unique(Collection<T> inSequences)
054   {
055      Map<String, T> uniqueMap = new OrderedMap<String, T>();
056      for (T seq : inSequences)
057      {
058         byte[] checksum = seq.getSHA1Checksum();
059         String checksumString = new String(checksum);
060
061         if (! uniqueMap.containsKey(checksumString))
062         {
063            uniqueMap.put(checksumString, seq);
064         }
065//         else
066//        {
067//            System.out.println(seq.getID() + " is a duplicate of " + uniqueMap.get(checksumString).getID());
068//         }
069      }
070
071      return uniqueMap.values();
072   }
073
074   //--------------------------------------------------------------------------
075   /**
076    Guesses the BioSequenceType for the specified sequence. If the sequence is greater than 80% ATUGCN,
077    it will be called as NUCLEIC_ACID. Otherwise it will be called as PROTEIN.
078    @param inSequence the sequence to guess the sequence type from
079    @return the BioSequenceType of the input sequence
080    */
081   public static BioSequenceType guessBioSequenceType(String inSequence)
082   {
083      BioSequenceType guessedType = BioSequenceType.PROTEIN;
084
085      // Is the sequence greater that 80% ATGCN?
086      if (StringUtil.replaceAllRegexp(inSequence, NON_NUC_PATTERN, "").length() > (0.8 * inSequence.length()))
087      {
088         guessedType = BioSequenceType.NUCLEIC_ACID;
089      }
090
091      return guessedType;
092   }
093
094
095   //--------------------------------------------------------------------------
096   public static String getReverseComplementSequence(String inNucleicAcidSeq)
097   {
098      String outRevCompSeq;
099
100      try
101      {
102         outRevCompSeq = StreamUtil.inputStreamToString(new NucleicAcidComplementFilterInputStream(new ByteArrayInputStream(new StringBuilder(inNucleicAcidSeq).reverse().toString().getBytes())));
103      }
104      catch (IOException e)
105      {
106         throw new RuntimeException(e);
107      }
108
109      return outRevCompSeq;
110   }
111
112   //--------------------------------------------------------------------------
113   /**
114    * Creates a new SeqLocation that is relative to the opposite strand.
115    * Ex: [2, 6] on the reverse strand of a nucleotide sequence of length 10,
116    * would be converted to a forward strand relative location of [9, 5].
117    * @param inSeqLoc the initial sequence location
118    * @param inNucleotideSeqLength the length of the nucleotide sequence containing the location
119    * @return  a SeqLocation relative to the opposite strand
120    */
121   public static SeqLocation flipStrandSeqLocation(SeqLocation inSeqLoc, int inNucleotideSeqLength)
122   {
123      SeqLocation fwdRelativeSeqLoc = new SeqLocation()
124            .setChainId(inSeqLoc.getChainId())
125            .setStart(inNucleotideSeqLength - inSeqLoc.getStart() + 1)
126            .setEnd(inNucleotideSeqLength - inSeqLoc.getEnd() + 1);
127
128      return fwdRelativeSeqLoc;
129   }
130}