001package com.hfg.bio.seq; 002 003 004import com.hfg.util.StringUtil; 005import com.hfg.util.collection.OrderedMap; 006import com.hfg.util.io.StreamUtil; 007 008import java.io.ByteArrayInputStream; 009import java.io.IOException; 010import java.util.Collection; 011import java.util.Map; 012import java.util.regex.Pattern; 013 014//------------------------------------------------------------------------------ 015/** 016 Utility class of generic sequence functions. 017 <div> 018 @author J. Alex Taylor, hairyfatguy.com 019 </div> 020 */ 021//------------------------------------------------------------------------------ 022// com.hfg Library 023// 024// This library is free software; you can redistribute it and/or 025// modify it under the terms of the GNU Lesser General Public 026// License as published by the Free Software Foundation; either 027// version 2.1 of the License, or (at your option) any later version. 028// 029// This library is distributed in the hope that it will be useful, 030// but WITHOUT ANY WARRANTY; without even the implied warranty of 031// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 032// Lesser General Public License for more details. 033// 034// You should have received a copy of the GNU Lesser General Public 035// License along with this library; if not, write to the Free Software 036// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 037// 038// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 039// jataylor@hairyfatguy.com 040//------------------------------------------------------------------------------ 041 042public class SeqUtil 043{ 044 private static final Pattern NON_NUC_PATTERN = Pattern.compile("[^ATUGCN]", Pattern.CASE_INSENSITIVE); 045 046 //-------------------------------------------------------------------------- 047 /** 048 Retains only a single copy of each distinct sequence. 049 @param inSequences the collection of (potentially redundant) input sequences 050 @param <T> BioSequence-implementing class 051 @return the non-redundant collection of sequences 052 */ 053 public static <T extends BioSequence> Collection<T> unique(Collection<T> inSequences) 054 { 055 Map<String, T> uniqueMap = new OrderedMap<String, T>(); 056 for (T seq : inSequences) 057 { 058 byte[] checksum = seq.getSHA1Checksum(); 059 String checksumString = new String(checksum); 060 061 if (! uniqueMap.containsKey(checksumString)) 062 { 063 uniqueMap.put(checksumString, seq); 064 } 065// else 066// { 067// System.out.println(seq.getID() + " is a duplicate of " + uniqueMap.get(checksumString).getID()); 068// } 069 } 070 071 return uniqueMap.values(); 072 } 073 074 //-------------------------------------------------------------------------- 075 /** 076 Guesses the BioSequenceType for the specified sequence. If the sequence is greater than 80% ATUGCN, 077 it will be called as NUCLEIC_ACID. Otherwise it will be called as PROTEIN. 078 @param inSequence the sequence to guess the sequence type from 079 @return the BioSequenceType of the input sequence 080 */ 081 public static BioSequenceType guessBioSequenceType(String inSequence) 082 { 083 BioSequenceType guessedType = BioSequenceType.PROTEIN; 084 085 // Is the sequence greater that 80% ATGCN? 086 if (StringUtil.replaceAllRegexp(inSequence, NON_NUC_PATTERN, "").length() > (0.8 * inSequence.length())) 087 { 088 guessedType = BioSequenceType.NUCLEIC_ACID; 089 } 090 091 return guessedType; 092 } 093 094 095 //-------------------------------------------------------------------------- 096 public static String getReverseComplementSequence(String inNucleicAcidSeq) 097 { 098 String outRevCompSeq; 099 100 try 101 { 102 outRevCompSeq = StreamUtil.inputStreamToString(new NucleicAcidComplementFilterInputStream(new ByteArrayInputStream(new StringBuilder(inNucleicAcidSeq).reverse().toString().getBytes()))); 103 } 104 catch (IOException e) 105 { 106 throw new RuntimeException(e); 107 } 108 109 return outRevCompSeq; 110 } 111 112 //-------------------------------------------------------------------------- 113 /** 114 * Creates a new SeqLocation that is relative to the opposite strand. 115 * Ex: [2, 6] on the reverse strand of a nucleotide sequence of length 10, 116 * would be converted to a forward strand relative location of [9, 5]. 117 * @param inSeqLoc the initial sequence location 118 * @param inNucleotideSeqLength the length of the nucleotide sequence containing the location 119 * @return a SeqLocation relative to the opposite strand 120 */ 121 public static SeqLocation flipStrandSeqLocation(SeqLocation inSeqLoc, int inNucleotideSeqLength) 122 { 123 SeqLocation fwdRelativeSeqLoc = new SeqLocation() 124 .setChainId(inSeqLoc.getChainId()) 125 .setStart(inNucleotideSeqLength - inSeqLoc.getStart() + 1) 126 .setEnd(inNucleotideSeqLength - inSeqLoc.getEnd() + 1); 127 128 return fwdRelativeSeqLoc; 129 } 130}