001package com.hfg.bio.seq;
002
003import java.lang.reflect.Constructor;
004import java.util.*;
005import java.io.*;
006import java.util.regex.Matcher;
007import java.util.regex.Pattern;
008
009import com.hfg.bio.*;
010import com.hfg.bio.seq.format.SeqIOException;
011import com.hfg.chem.Element;
012import com.hfg.chem.Molecule;
013import com.hfg.exception.ProgrammingException;
014import com.hfg.exception.UnimplementedMethodException;
015import com.hfg.util.ChecksumUtil;
016import com.hfg.util.io.GZIP;
017import com.hfg.util.io.SegmentReader;
018import com.hfg.util.io.StreamUtil;
019import com.hfg.util.collection.CollectionUtil;
020import com.hfg.util.StringUtil;
021import com.hfg.xml.XMLNode;
022import com.hfg.xml.XMLTag;
023import com.hfg.xml.XMLizable;
024
025
026//------------------------------------------------------------------------------
027/**
028 Biological Protein, DNA, or RNA sequence.
029 <div>
030  @author J. Alex Taylor, hairyfatguy.com
031 </div>
032 */
033//------------------------------------------------------------------------------
034// com.hfg XML/HTML Coding Library
035//
036// This library is free software; you can redistribute it and/or
037// modify it under the terms of the GNU Lesser General Public
038// License as published by the Free Software Foundation; either
039// version 2.1 of the License, or (at your option) any later version.
040//
041// This library is distributed in the hope that it will be useful,
042// but WITHOUT ANY WARRANTY; without even the implied warranty of
043// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
044// Lesser General Public License for more details.
045//
046// You should have received a copy of the GNU Lesser General Public
047// License along with this library; if not, write to the Free Software
048// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
049//
050// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
051// jataylor@hairyfatguy.com
052//------------------------------------------------------------------------------
053// TODO:
054//   - Create a composition filter that will track the composition as the sequence is set.
055//   - Restrict the sequence chars to the residue set.
056
057public abstract class BioSequenceImpl extends Molecule implements Cloneable, BioSequence
058{
059
060   //##########################################################################
061   // PRIVATE FIELDS
062   //##########################################################################
063
064   private String       mID;
065   private String       mDescription;
066   private String       mUncompressedSequence;
067   private List<byte[]> mCompressedSeqChunks;
068   private int          mSeqLength;
069   private Integer      mNumGaps;
070   private Integer      mTotalGapLength;
071
072   private Map<String, Integer> mComposition;
073
074   private byte[]       mMD5Checksum;
075   private byte[]       mSHA1Checksum;
076
077   // How long the sequence should be before compression is used.
078   private int mCompressionThreshold = sDefaultCompressionThreshold;
079
080   // Default sequence compression threshold.
081   private static int   sDefaultCompressionThreshold = 4 * 1024;
082
083   // Pattern of the characters that should get removed when setting the sequence
084//   private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[\\s\\d;_'\\\"]");
085   private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[^a-z\\*\\-]", Pattern.CASE_INSENSITIVE);
086
087   protected static final Pattern GAP_PATTERN = Pattern.compile("-+");
088
089   //##########################################################################
090   // CONSTRUCTORS
091   //##########################################################################
092
093   //--------------------------------------------------------------------------
094   public BioSequenceImpl()
095   {
096
097   }
098
099   //--------------------------------------------------------------------------
100   public BioSequenceImpl(XMLNode inXML)
101   {
102      setID(inXML.getAttributeValue(HfgBioXML.ID_ATT));
103
104      XMLNode descriptionTag = inXML.getOptionalSubtagByName(HfgBioXML.DESCRIPTION_TAG);
105      if (descriptionTag != null) setDescription(descriptionTag.getContent());
106
107      setSequence(inXML.getContent());
108
109      XMLNode attributesTag = inXML.getOptionalSubtagByName(HfgBioXML.ATTRIBUTES_TAG);
110      if (attributesTag != null)
111      {
112         List<? extends XMLNode> attributeTags = attributesTag.getSubtagsByName(HfgBioXML.ATTRIBUTE_TAG);
113         for (XMLNode attrTag : attributeTags)
114         {
115            String name = attrTag.getAttributeValue(HfgBioXML.NAME_ATT);
116            String className = attrTag.getAttributeValue(HfgBioXML.CLASS_ATT);
117            if (attrTag.getAttributeValue(HfgBioXML.IS_NULL_ATT) != null)
118            {
119               setAttribute(name, null);
120            }
121            else
122            {
123               String value = attrTag.getUnescapedContent();
124               try
125               {
126                  Class clazz = Class.forName(className);
127                  if (String.class.getName().equals(className))
128                  {
129                     setAttribute(name, value);
130                  }
131                  else if (Integer.class.getName().equals(className))
132                  {
133                     setAttribute(name, Integer.valueOf(value));
134                  }
135                  else if (Long.class.getName().equals(className))
136                  {
137                     setAttribute(name, Long.valueOf(value));
138                  }
139                  else if (Float.class.getName().equals(className))
140                  {
141                     setAttribute(name, Float.valueOf(value));
142                  }
143                  else if (Double.class.getName().equals(className))
144                  {
145                     setAttribute(name, Double.valueOf(value));
146                  }
147                  else if (Boolean.class.getName().equals(className))
148                  {
149                     setAttribute(name, Boolean.valueOf(value));
150                  }
151                  else
152                  {
153                     try
154                     {
155                        Constructor constructor = clazz.getConstructor(XMLTag.class);
156                        setAttribute(name, constructor.newInstance(new XMLTag(new ByteArrayInputStream(value.getBytes()))));
157                     }
158                     catch (NoSuchMethodException e)
159                     {
160                        Constructor constructor = clazz.getConstructor(String.class);
161                        setAttribute(name, constructor.newInstance(value));
162                     }
163                  }
164               }
165               catch (Exception e)
166               {
167                  throw new RuntimeException("Problem extracting the value for seq attribute " + StringUtil.singleQuote(name) + "!", e);
168               }
169            }
170         }
171      }
172   }
173
174   //##########################################################################
175   // PUBLIC METHODS
176   //##########################################################################
177
178   //---------------------------------------------------------------------------
179   public BioSequence setCompressionThreshold(int inNumBytes)
180   {
181      mCompressionThreshold = inNumBytes;
182      return this;
183   }
184
185   //--------------------------------------------------------------------------
186   public BioSequenceType getType()
187   {
188      return null;
189   }
190
191   //--------------------------------------------------------------------------
192   @Override
193   public String toString()
194   {
195      return getSequence();
196   }
197
198   //--------------------------------------------------------------------------
199   @Override
200   public BioSequenceImpl clone()
201   {
202      clearCalculatedProperties();
203
204      BioSequenceImpl theClone = (BioSequenceImpl) super.clone();
205
206      if (mCompressedSeqChunks != null)
207      {
208         theClone.mCompressedSeqChunks = new ArrayList<>(mCompressedSeqChunks.size());
209         for (byte[] chunk : mCompressedSeqChunks)
210         {
211            theClone.mCompressedSeqChunks.add(chunk);
212         }
213      }
214
215      if (mComposition != null)
216      {
217         theClone.mComposition = new HashMap<>(mComposition.size());
218         for (String residue : mComposition.keySet())
219         {
220            theClone.mComposition.put(residue, mComposition.get(residue));
221         }
222      }
223
224      return theClone;
225   }
226
227   //---------------------------------------------------------------------------
228   @Override
229   public boolean equals(Object inObj2)
230   {
231      return (0 == compareTo(inObj2));
232   }
233
234   //--------------------------------------------------------------------------
235   public BioSequence setID(String inValue)
236   {
237      mID = inValue;
238      return this;
239   }
240
241   //--------------------------------------------------------------------------
242   public String getID()
243   {
244      return mID;
245   }
246
247   //--------------------------------------------------------------------------
248   public BioSequence setDescription(CharSequence inValue)
249   {
250      mDescription = (inValue != null ? inValue.toString() : null);
251      return this;
252   }
253
254   //--------------------------------------------------------------------------
255   public String getDescription()
256   {
257      return mDescription;
258   }
259
260   //--------------------------------------------------------------------------
261   public BioSequence setSequence(CharSequence inValue)
262   {
263      if (inValue != null)
264      {
265         setSequence(new StringReader(inValue.toString()));
266      }
267      else
268      {
269         mCompressedSeqChunks = null;
270         mUncompressedSequence = null;
271         clearElementalCompositionAndCalculatedProperties();
272      }
273
274      return this;
275   }
276
277   //--------------------------------------------------------------------------
278   public BioSequence setSequence(Reader inReader)
279      throws SeqIOException
280   {                                             // Remove whitespace and numbers
281      Reader bufferedReader = new BufferedReader(new SeqFilterReader(inReader));
282
283      boolean compressFlag = false;
284      int seqLength = 0;
285      StringBuilder uncompressedSeq = null;
286      try
287      {
288         char[] readBuffer = new char[1024];
289         int readSize = 0;
290         while ((readSize = bufferedReader.read(readBuffer)) >= 0)
291         {
292            if (readSize > mCompressionThreshold)
293            {
294               mCompressedSeqChunks = new ArrayList<>(4);
295               compressFlag = true;
296            }
297            
298            if (! compressFlag
299                && uncompressedSeq != null  
300                && uncompressedSeq.length() + readSize > mCompressionThreshold)
301            {
302               // Compress what has already accumulated in the uncompressed seq
303               mCompressedSeqChunks = new ArrayList<>(4);
304
305               if (uncompressedSeq.length() > 0)
306               {
307                  mCompressedSeqChunks.add(GZIP.compress(uncompressedSeq.toString()));
308                  seqLength = uncompressedSeq.length();
309               }
310               uncompressedSeq = null;
311
312               mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize)));
313               seqLength += readSize;
314               
315               compressFlag = true;
316               // Use a bigger read buffer moving forward
317               readBuffer = new char[mCompressionThreshold];
318            }
319            else if (compressFlag)
320            {
321               mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize)));
322               seqLength += readSize;
323            }
324            else
325            {
326               if (null == uncompressedSeq)
327               {
328                  uncompressedSeq = new StringBuilder(readSize); 
329               }
330               
331               uncompressedSeq.append(readBuffer, 0, readSize);
332            }
333         }
334      }
335      catch (SeqIOException e)
336      {
337         throw e;
338      }
339      catch (Exception e)
340      {
341         throw new SeqIOException(e);
342      }
343
344      if (uncompressedSeq != null)
345      {
346         mUncompressedSequence = uncompressedSeq.toString();
347         mSeqLength = mUncompressedSequence.length();
348      }
349      else
350      {
351         mSeqLength = seqLength;
352         mUncompressedSequence = null;
353      }
354
355      // Calculate checksums  TODO: Calculate on the stream as the sequence is set
356
357      clearElementalCompositionAndCalculatedProperties();
358      return this;
359   }
360
361   //--------------------------------------------------------------------------
362   public String getSequence()
363   {
364      String outSeq = null;
365
366      if (mUncompressedSequence != null)
367      {
368         outSeq = mUncompressedSequence;
369      }
370      else if (CollectionUtil.hasValues(mCompressedSeqChunks))
371      {
372         try
373         {
374            outSeq = StreamUtil.inputStreamToString(getSequenceStream());
375         }
376         catch (IOException e)
377         {
378            throw new RuntimeException(e);
379         }
380      }
381
382      return outSeq;
383   }
384
385   //--------------------------------------------------------------------------
386   public Reader getSequenceReader()
387   {
388      Reader seqReader = null;
389
390      InputStream seqStream = getSequenceStream();
391      if (seqStream != null)
392      {
393         seqReader = new InputStreamReader(seqStream);
394      }
395
396      return seqReader;
397   }
398
399   //--------------------------------------------------------------------------
400   public Reader getSubSequenceReader(SeqLocation inSeqLocation)
401   {
402      SeqLocation seqLocation = inSeqLocation;
403      if (null == seqLocation)
404      {
405         seqLocation = new SeqLocation(1, length());
406      }
407
408      return new SegmentReader(getSequenceReader(), seqLocation);
409   }
410
411   //--------------------------------------------------------------------------
412   public InputStream getSequenceStream()
413   {
414      InputStream seqStream = null;
415
416      if (mUncompressedSequence != null)
417      {
418         seqStream = new ByteArrayInputStream(mUncompressedSequence.getBytes());
419      }
420      else if (CollectionUtil.hasValues(mCompressedSeqChunks))
421      {
422         seqStream = new SeqStreamer(Direction.FORWARD);
423      }
424
425      return seqStream;
426   }
427
428   //--------------------------------------------------------------------------
429   public String getSubSequence(SeqLocation inSeqLocation)
430   {
431      String sequence = null;
432
433      Reader reader = null;
434      try
435      {
436         reader = getSubSequenceReader(inSeqLocation);
437         sequence = StreamUtil.readerToString(reader);
438      }
439      catch (IOException e)
440      {
441         throw new ProgrammingException(e);
442      }
443      finally
444      {
445         StreamUtil.close(reader);
446      }
447
448      return sequence;
449   }
450
451   //--------------------------------------------------------------------------
452   public byte[] getMD5Checksum()
453   {
454      if (null == mMD5Checksum
455          && getSequence() != null)
456      {
457         mMD5Checksum  = ChecksumUtil.calculateMD5(getSequence());
458      }
459
460      return mMD5Checksum;
461   }
462
463   //--------------------------------------------------------------------------
464   public byte[] getSHA1Checksum()
465   {
466      if (null == mSHA1Checksum
467          && getSequence() != null)
468      {
469         mSHA1Checksum = ChecksumUtil.calculateSHA1(getSequence());
470      }
471
472      return mSHA1Checksum;
473   }
474
475
476   //--------------------------------------------------------------------------
477   public boolean containsGaps()
478   {
479      return (getNumGaps() > 0);
480   }
481
482   //--------------------------------------------------------------------------
483   public Integer getTotalGapLength()
484   {
485      if (null == mTotalGapLength)
486      {
487         getNumGaps();
488      }
489
490      return mTotalGapLength;
491   }
492
493   //--------------------------------------------------------------------------
494   protected void setNumGaps(Integer inValue)
495   {
496      mNumGaps = inValue;
497   }
498
499   //--------------------------------------------------------------------------
500   protected void setTotalGapLength(Integer inValue)
501   {
502      mTotalGapLength = inValue;
503   }
504
505   //--------------------------------------------------------------------------
506   public Integer getNumGaps()
507   {
508      if (null == mNumGaps)
509      {
510         countGaps();
511      }
512
513      return mNumGaps;
514   }
515
516   //--------------------------------------------------------------------------
517   protected void countGaps()
518   {
519      Matcher m = GAP_PATTERN.matcher(getSequence());
520      int count = 0;
521      int totalGapLength = 0;
522      while (m.find())
523      {
524         count++;
525         totalGapLength += m.group(0).length();
526      }
527
528      mNumGaps = count;
529      mTotalGapLength = totalGapLength;
530   }
531
532
533   //--------------------------------------------------------------------------
534   public int length()
535   {
536      return mSeqLength;
537   }
538
539   //--------------------------------------------------------------------------
540   /**
541    Returns the residue character at the specified (1-based) sequence location.
542    @param inIndex the 1-based residue position
543    @return the residue at the specified position
544    */
545   public char residueAt(int inIndex)
546   {
547      if (inIndex <= 0
548          || inIndex > length())
549      {
550         throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!");
551      }
552
553      Reader reader = getSequenceReader();
554      try
555      {
556         reader.skip(inIndex - 1);
557         return (char) reader.read();
558      }
559      catch (IOException e)
560      {
561         throw new RuntimeException(e);
562      }
563   }
564
565   //--------------------------------------------------------------------------
566   /**
567    Specifies the residue to use at the specified (1-based) sequence location.
568    * @param inIndex the 1-based residue position
569    * @param inResidue the residue to place at the specified position
570    */
571   public void setResidueAt(int inIndex, char inResidue)
572   {
573      if (inIndex <= 0
574          || inIndex > length())
575      {
576         throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!");
577      }
578
579      StringBuilder buffer = new StringBuilder(getSequence());
580      buffer.setCharAt(inIndex - 1, inResidue);
581
582      setSequence(buffer.toString());
583   }
584
585
586   //--------------------------------------------------------------------------
587   @Override
588   public XMLNode toXMLNode()
589   {
590      XMLNode node = new XMLTag(HfgBioXML.HFGBIOSEQ_TAG);
591      node.setAttribute(HfgBioXML.ID_ATT, getID());
592      node.setAttribute(HfgBioXML.CLASS_ATT, getClass().getName());
593      node.setContent(getSequence());
594      if (StringUtil.isSet(getDescription()))
595      {
596         XMLNode descTag = new XMLTag(HfgBioXML.DESCRIPTION_TAG);
597         descTag.setContent(getDescription());
598         node.addSubtag(descTag);
599      }
600
601      if (hasAttributes())
602      {
603         XMLNode attributesTag = new XMLTag(HfgBioXML.ATTRIBUTES_TAG);
604         node.addSubtag(attributesTag);
605
606         for (String attrName : getAttributeNames())
607         {
608            XMLNode attributeTag = new XMLTag(HfgBioXML.ATTRIBUTE_TAG);
609            attributeTag.setAttribute(HfgBioXML.NAME_ATT, attrName);
610
611            Object value = getAttribute(attrName);
612            if (value != null)
613            {
614               attributeTag.setAttribute(HfgBioXML.CLASS_ATT, value.getClass().getName());
615               if (value instanceof XMLizable)
616               {
617                  attributeTag.addContentWithoutEscaping(((XMLizable)value).toXML());
618               }
619               else
620               {
621                  attributeTag.setContent(value.toString());
622               }
623            }
624            else
625            {
626               attributeTag.setAttribute(HfgBioXML.IS_NULL_ATT, 1);
627            }
628
629            attributesTag.addSubtag(attributeTag);
630         }
631      }
632
633      return node;
634   }
635
636   //--------------------------------------------------------------------------
637   /**
638    Returns a Map with 1-letter code Strings as keys and Integers as the values.
639    If a residue isn't present in the sequence, there won't be an entry for it in the map.
640    @return Map with 1-letter code Strings as keys and Integers as the values
641    */
642   protected Map<String, Integer> getComposition()
643   {
644      if (null == mComposition)
645      {
646         Map<String, int[]> map = new HashMap<>();
647
648         if (length() < mCompressionThreshold)
649         {
650            // This should be more performant when the sequence is short
651            String seq = getSequence();
652            for (int i = 0; i < seq.length(); i++)
653            {
654               char seqChar = seq.charAt(i);
655               int[] counter = map.get(seqChar + "");
656               if (null == counter)
657               {
658                  map.put(seqChar + "", new int[]{1});
659               }
660               else
661               {
662                  counter[0]++;
663               }
664            }
665         }
666         else
667         {
668            try
669            {
670               Reader seqReader = null;
671               try
672               {
673                  seqReader = getSequenceReader();
674                  if (seqReader != null)
675                  {
676                     int seqChar;
677                     while ((seqChar = seqReader.read()) != -1)
678                     {
679                        int[] counter = map.get((char) seqChar + "");
680                        if (null == counter)
681                        {
682                           map.put((char) seqChar + "", new int[]{1});
683                        }
684                        else
685                        {
686                           counter[0]++;
687                        }
688                     }
689                  }
690               }
691               finally
692               {
693                  if (seqReader != null) seqReader.close();
694               }
695            }
696            catch (IOException e)
697            {
698               throw new RuntimeException(e);
699            }
700         }
701
702         mComposition = new HashMap<>();
703
704         for (String residue : map.keySet())
705         {
706            mComposition.put(residue, map.get(residue)[0]);
707         }
708      }
709
710      return mComposition;
711   }
712
713   //--------------------------------------------------------------------------
714   /**
715    Returns an unmodifiable copy of the elemental composition Map. The keys are
716    Element objects and the values are Floats. Why Floats instead of Integers you
717    ask? Because some amino acid codes such as B and Z are ambiguous averages.
718    @return the elemental composition map
719    */
720   @Override
721   public Map<Element, Float> getElementalComposition()
722   {
723      Map<Element, Float> elementalComposition = super.getElementalComposition();
724      if (! CollectionUtil.hasValues(elementalComposition))
725      {
726         recalculateElementalComposition();
727         elementalComposition = super.getElementalComposition();
728      }
729
730      return elementalComposition;
731   }
732
733   //--------------------------------------------------------------------------
734   /**
735    Returns a chemical formula String like 'C5H11NO'. If carbon is present, it
736    is listed first followed by the other elements in ascending mass order.
737    Symbols for pure elements are enclosed in square brackets such as '[2H]2O'
738    for deuterated water.
739    @return the un-formatted chemical formula string
740    */
741   @Override
742   public String getChemicalFormula()
743   {
744      String chemicalFormula = super.getChemicalFormula();
745      if (null == chemicalFormula)
746      {
747         getElementalComposition();
748         chemicalFormula = super.getChemicalFormula();
749      }
750
751      return chemicalFormula;
752   }
753
754   //--------------------------------------------------------------------------
755   /**
756    Returns a chemical formula String like 'C₅H₁₁NO'. If carbon is present, it
757    is listed first followed by the other elements in ascending mass order.
758    Symbols for isotopes are enclosed in square brackets such as '[²H]₂O'
759    for deuterated water.
760    @return the chemical formula string
761    */
762   @Override
763   public String getChemicalFormulaWithSubscripts()
764   {
765      String chemicalFormula = super.getChemicalFormulaWithSubscripts();
766      if (null == chemicalFormula)
767      {
768         getElementalComposition();
769         chemicalFormula = super.getChemicalFormulaWithSubscripts();
770      }
771
772      return chemicalFormula;
773   }
774
775   //--------------------------------------------------------------------------
776   @Override
777   public Double getMonoisotopicMass()
778   {
779      Double mass = super.getMonoisotopicMass();
780      if (null == mass)
781      {
782         getElementalComposition();
783         mass = super.getMonoisotopicMass();
784      }
785
786      return mass;
787   }
788
789   //--------------------------------------------------------------------------
790   @Override
791   public Double getAverageMass()
792   {
793      Double mass = super.getAverageMass();
794      if (null == mass)
795      {
796         getElementalComposition();
797         mass = super.getAverageMass();
798      }
799
800      return mass;
801   }
802
803   //--------------------------------------------------------------------------
804   /**
805    Organic mass values used are from:
806    <pre>
807      <i>Zhang Z, Pan H, Chen X. 2009. Mass spectrometry for structural characterization
808      of therapeutic antibodies. Mass Spectrom Rev 28:147-176.</i>
809    </pre>
810    @return the organic average mass for the sequence
811    */
812   @Override
813   public Double getOrganicAverageMass()
814   {
815      Double mass = super.getOrganicAverageMass();
816      if (null == mass)
817      {
818         getElementalComposition();
819         mass = super.getOrganicAverageMass();
820      }
821
822      return mass;
823   }
824
825   //--------------------------------------------------------------------------
826   @Override
827   public void clearCalculatedProperties()
828   {
829      super.clearCalculatedProperties();
830
831      mComposition = null;
832      mNumGaps = null;
833      mTotalGapLength = null;
834   }
835
836   //##########################################################################
837   // PROTECTED METHODS
838   //##########################################################################
839
840
841   //--------------------------------------------------------------------------
842   // Setup this way to avoid a stackoverflow if clearElementalComposition() is called within clearCalculatedProperties().
843   public void clearElementalCompositionAndCalculatedProperties()
844   {
845      super.clearElementalComposition();
846      mComposition = null;
847      mNumGaps = null;
848      mTotalGapLength = null;
849   }
850
851
852   //--------------------------------------------------------------------------
853   // Needs to be implemented by extending classes.
854   protected Map<Molecule, Integer> getResidueComposition()
855   {
856      throw new UnimplementedMethodException("getResidueComposition() not implemented for this class!");
857   }
858
859   //--------------------------------------------------------------------------
860   // Needs to be implemented by extending classes.
861   protected Map<Molecule, Integer> getTerminiComposition()
862   {
863      throw new UnimplementedMethodException("getTerminiComposition() not implemented for this class!");
864   }
865
866   //--------------------------------------------------------------------------
867   // Needs to be implemented by extending classes.
868   protected Map<? extends Molecule, Integer> getXLinkComposition()
869   {
870      throw new UnimplementedMethodException("getXLinkComposition() not implemented for this class!");
871   }
872
873   //--------------------------------------------------------------------------
874   protected void recalculateElementalComposition()
875   {
876      clearElementalComposition();
877
878      // Residues
879      Map<Molecule, Integer> residueComposition = getResidueComposition();
880      if (CollectionUtil.hasValues(residueComposition))
881      {
882         for (Molecule residue : residueComposition.keySet())
883         {
884            int count = residueComposition.get(residue);
885            if (count > 0)
886            {
887               addElementalComposition(residue.getElementalComposition(), count);
888            }
889         }
890      }
891
892      // Termini
893      Map<Molecule, Integer> terminiComposition = getTerminiComposition();
894      if (CollectionUtil.hasValues(terminiComposition))
895      {
896         for (Molecule terminus : terminiComposition.keySet())
897         {
898            int count = terminiComposition.get(terminus);
899            if (count > 0)
900            {
901               addElementalComposition(terminus.getElementalComposition(), count);
902            }
903         }
904      }
905      
906      // X-Links?
907      Map<? extends Molecule, Integer> xLinkComposition = getXLinkComposition();
908      if (CollectionUtil.hasValues(xLinkComposition))
909      {
910         for (Molecule xlink : xLinkComposition.keySet())
911         {
912            int count = xLinkComposition.get(xlink);
913            if (count > 0)
914            {
915               addElementalComposition(xlink.getElementalComposition(), count);
916            }
917         }
918      }
919   }
920
921
922   //--------------------------------------------------------------------------
923   protected InputStream getReverseSequenceStream()
924   {
925      InputStream seqStream = null;
926
927      if (mUncompressedSequence != null)
928      {
929         seqStream = new ByteArrayInputStream(new StringBuilder(mUncompressedSequence).reverse().toString().getBytes());
930      }
931      else if (CollectionUtil.hasValues(mCompressedSeqChunks))
932      {
933         seqStream = new SeqStreamer(Direction.REVERSE);
934      }
935
936      return seqStream;
937   }
938
939   //##########################################################################
940   // INNER CLASSES
941   //##########################################################################
942
943
944   private enum Direction
945   {
946      FORWARD, REVERSE;
947   }
948
949   private class SeqStreamer extends InputStream
950   {
951      private Direction mDirection;
952      private String    mCurrentChunk;
953      private int       mCurrentChunkIndex;
954      private int       mCharIndex;
955      private boolean   mDone = false;
956
957      //-----------------------------------------------------------------------
958      public SeqStreamer(Direction inDirection)
959      {
960         mDirection = inDirection;
961         mCurrentChunkIndex = (mDirection == Direction.FORWARD ? 0 : mCompressedSeqChunks.size() - 1);
962      }
963
964      //-----------------------------------------------------------------------
965      public int read()
966      {
967         return (mDone ? -1 : getNextChar());
968      }
969
970      //-----------------------------------------------------------------------
971      private char getNextChar()
972      {
973         if (null == mCurrentChunk)
974         {
975            if (mDirection == Direction.FORWARD)
976            {
977               mCurrentChunk = GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex));
978            }
979            else
980            {
981               mCurrentChunk = new StringBuilder(GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex))).reverse().toString();
982
983            }
984
985            mCharIndex = 0;
986         }
987
988         char nextChar = mCurrentChunk.charAt(mCharIndex++);
989
990         if (mCharIndex >= mCurrentChunk.length())
991         {
992            // This is the last char in this chunk.
993            mCurrentChunk = null;
994            mCurrentChunkIndex = mCurrentChunkIndex + (mDirection == Direction.FORWARD ? 1 : - 1);
995            if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedSeqChunks.size())
996            {
997               // This was the last chunk.
998               mDone = true;
999            }
1000         }
1001
1002         return nextChar;
1003      }
1004   }
1005
1006
1007   public class SeqFilterReader extends FilterReader
1008   {
1009      private char[]  mBuffer = new char[8196];
1010      private int     mBufferLimit;
1011      private int     mBufferIndex;
1012      private boolean mEndOfStreamReached;
1013
1014      //---------------------------------------------------------------------------
1015      public SeqFilterReader(Reader inReader)
1016      {
1017         super(inReader);
1018      }
1019
1020      //---------------------------------------------------------------------------
1021      public int read()
1022            throws IOException
1023      {
1024         int returnChar;
1025/*
1026         do
1027         {
1028            if (mBufferIndex >= mBufferLimit)
1029            {
1030               fillBuffer();
1031            }
1032
1033            returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]);
1034         }
1035         while (returnChar >= 0
1036                && returnChar != '*'  
1037                && returnChar != '-'
1038                && (returnChar < 65 || returnChar > 90)   // Not upper case letter
1039                && (returnChar < 97 || returnChar > 122)  // Not lower case letter
1040         );
1041*/
1042         while (true)
1043         {
1044            if (mBufferIndex >= mBufferLimit)
1045            {
1046               fillBuffer();
1047            }
1048
1049            returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]);
1050            
1051            if ((returnChar > 64 && returnChar < 91) // Upper-case letter
1052                || (returnChar > 96 && returnChar < 123) // Lower-case letter
1053                || returnChar == '-' // Gap
1054                || returnChar == '*' // Stop codon
1055                || returnChar < 0)   // End of stream
1056            {
1057               break;
1058            }
1059         };
1060
1061         return returnChar;
1062      }
1063
1064      //---------------------------------------------------------------------------
1065      @Override
1066      public int read(char[] inBuffer, int inOffset, int inMaxReadLength)
1067            throws IOException
1068      {
1069         int theChar;
1070         int numCharsRead = 0;
1071         do
1072         {
1073            while (true)
1074            {
1075               if (mBufferIndex >= mBufferLimit)
1076               {
1077                  fillBuffer();
1078               }
1079
1080               theChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]);
1081
1082               if ((theChar > 64 && theChar < 91) // Upper-case letter
1083                     || (theChar > 96 && theChar < 123) // Lower-case letter
1084                     || theChar == '-' // Gap
1085                     || theChar == '*' // Stop codon
1086                     || theChar < 0)   // End of stream
1087               {
1088                  break;
1089               }
1090            };
1091                        
1092            if (theChar > 0)
1093            {
1094               inBuffer[inOffset++] = (char) theChar;
1095               numCharsRead++;
1096            }
1097         }
1098         while (theChar >= 0
1099               && numCharsRead < inMaxReadLength);
1100
1101         return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead);
1102      }
1103
1104      //---------------------------------------------------------------------------
1105      private void fillBuffer()
1106            throws IOException
1107      {
1108         mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length);
1109
1110         if (-1 == mBufferLimit)
1111         {
1112            mEndOfStreamReached = true;
1113         }
1114
1115         // Reset the index
1116         mBufferIndex = 0;
1117      }
1118   }
1119
1120}