001package com.hfg.bio.seq; 002 003import java.lang.reflect.Constructor; 004import java.util.*; 005import java.io.*; 006import java.util.regex.Matcher; 007import java.util.regex.Pattern; 008 009import com.hfg.bio.*; 010import com.hfg.bio.seq.format.SeqIOException; 011import com.hfg.chem.Element; 012import com.hfg.chem.Molecule; 013import com.hfg.exception.ProgrammingException; 014import com.hfg.exception.UnimplementedMethodException; 015import com.hfg.util.ChecksumUtil; 016import com.hfg.util.io.GZIP; 017import com.hfg.util.io.SegmentReader; 018import com.hfg.util.io.StreamUtil; 019import com.hfg.util.collection.CollectionUtil; 020import com.hfg.util.StringUtil; 021import com.hfg.xml.XMLNode; 022import com.hfg.xml.XMLTag; 023import com.hfg.xml.XMLizable; 024 025 026//------------------------------------------------------------------------------ 027/** 028 Biological Protein, DNA, or RNA sequence. 029 <div> 030 @author J. Alex Taylor, hairyfatguy.com 031 </div> 032 */ 033//------------------------------------------------------------------------------ 034// com.hfg XML/HTML Coding Library 035// 036// This library is free software; you can redistribute it and/or 037// modify it under the terms of the GNU Lesser General Public 038// License as published by the Free Software Foundation; either 039// version 2.1 of the License, or (at your option) any later version. 040// 041// This library is distributed in the hope that it will be useful, 042// but WITHOUT ANY WARRANTY; without even the implied warranty of 043// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 044// Lesser General Public License for more details. 045// 046// You should have received a copy of the GNU Lesser General Public 047// License along with this library; if not, write to the Free Software 048// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 049// 050// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 051// jataylor@hairyfatguy.com 052//------------------------------------------------------------------------------ 053// TODO: 054// - Create a composition filter that will track the composition as the sequence is set. 055// - Restrict the sequence chars to the residue set. 056 057public abstract class BioSequenceImpl extends Molecule implements Cloneable, BioSequence 058{ 059 060 //########################################################################## 061 // PRIVATE FIELDS 062 //########################################################################## 063 064 private String mID; 065 private String mDescription; 066 private String mUncompressedSequence; 067 private List<byte[]> mCompressedSeqChunks; 068 private int mSeqLength; 069 private Integer mNumGaps; 070 private Integer mTotalGapLength; 071 072 private Map<String, Integer> mComposition; 073 074 private byte[] mMD5Checksum; 075 private byte[] mSHA1Checksum; 076 077 // How long the sequence should be before compression is used. 078 private int mCompressionThreshold = sDefaultCompressionThreshold; 079 080 // Default sequence compression threshold. 081 private static int sDefaultCompressionThreshold = 4 * 1024; 082 083 // Pattern of the characters that should get removed when setting the sequence 084// private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[\\s\\d;_'\\\"]"); 085 private static final Pattern SEQUENCE_PURIFICATION_PATTERN = Pattern.compile("[^a-z\\*\\-]", Pattern.CASE_INSENSITIVE); 086 087 protected static final Pattern GAP_PATTERN = Pattern.compile("-+"); 088 089 //########################################################################## 090 // CONSTRUCTORS 091 //########################################################################## 092 093 //-------------------------------------------------------------------------- 094 public BioSequenceImpl() 095 { 096 097 } 098 099 //-------------------------------------------------------------------------- 100 public BioSequenceImpl(XMLNode inXML) 101 { 102 setID(inXML.getAttributeValue(HfgBioXML.ID_ATT)); 103 104 XMLNode descriptionTag = inXML.getOptionalSubtagByName(HfgBioXML.DESCRIPTION_TAG); 105 if (descriptionTag != null) setDescription(descriptionTag.getContent()); 106 107 setSequence(inXML.getContent()); 108 109 XMLNode attributesTag = inXML.getOptionalSubtagByName(HfgBioXML.ATTRIBUTES_TAG); 110 if (attributesTag != null) 111 { 112 List<? extends XMLNode> attributeTags = attributesTag.getSubtagsByName(HfgBioXML.ATTRIBUTE_TAG); 113 for (XMLNode attrTag : attributeTags) 114 { 115 String name = attrTag.getAttributeValue(HfgBioXML.NAME_ATT); 116 String className = attrTag.getAttributeValue(HfgBioXML.CLASS_ATT); 117 if (attrTag.getAttributeValue(HfgBioXML.IS_NULL_ATT) != null) 118 { 119 setAttribute(name, null); 120 } 121 else 122 { 123 String value = attrTag.getUnescapedContent(); 124 try 125 { 126 Class clazz = Class.forName(className); 127 if (String.class.getName().equals(className)) 128 { 129 setAttribute(name, value); 130 } 131 else if (Integer.class.getName().equals(className)) 132 { 133 setAttribute(name, Integer.valueOf(value)); 134 } 135 else if (Long.class.getName().equals(className)) 136 { 137 setAttribute(name, Long.valueOf(value)); 138 } 139 else if (Float.class.getName().equals(className)) 140 { 141 setAttribute(name, Float.valueOf(value)); 142 } 143 else if (Double.class.getName().equals(className)) 144 { 145 setAttribute(name, Double.valueOf(value)); 146 } 147 else if (Boolean.class.getName().equals(className)) 148 { 149 setAttribute(name, Boolean.valueOf(value)); 150 } 151 else 152 { 153 try 154 { 155 Constructor constructor = clazz.getConstructor(XMLTag.class); 156 setAttribute(name, constructor.newInstance(new XMLTag(new ByteArrayInputStream(value.getBytes())))); 157 } 158 catch (NoSuchMethodException e) 159 { 160 Constructor constructor = clazz.getConstructor(String.class); 161 setAttribute(name, constructor.newInstance(value)); 162 } 163 } 164 } 165 catch (Exception e) 166 { 167 throw new RuntimeException("Problem extracting the value for seq attribute " + StringUtil.singleQuote(name) + "!", e); 168 } 169 } 170 } 171 } 172 } 173 174 //########################################################################## 175 // PUBLIC METHODS 176 //########################################################################## 177 178 //--------------------------------------------------------------------------- 179 public BioSequence setCompressionThreshold(int inNumBytes) 180 { 181 mCompressionThreshold = inNumBytes; 182 return this; 183 } 184 185 //-------------------------------------------------------------------------- 186 public BioSequenceType getType() 187 { 188 return null; 189 } 190 191 //-------------------------------------------------------------------------- 192 @Override 193 public String toString() 194 { 195 return getSequence(); 196 } 197 198 //-------------------------------------------------------------------------- 199 @Override 200 public BioSequenceImpl clone() 201 { 202 clearCalculatedProperties(); 203 204 BioSequenceImpl theClone = (BioSequenceImpl) super.clone(); 205 206 if (mCompressedSeqChunks != null) 207 { 208 theClone.mCompressedSeqChunks = new ArrayList<>(mCompressedSeqChunks.size()); 209 for (byte[] chunk : mCompressedSeqChunks) 210 { 211 theClone.mCompressedSeqChunks.add(chunk); 212 } 213 } 214 215 if (mComposition != null) 216 { 217 theClone.mComposition = new HashMap<>(mComposition.size()); 218 for (String residue : mComposition.keySet()) 219 { 220 theClone.mComposition.put(residue, mComposition.get(residue)); 221 } 222 } 223 224 return theClone; 225 } 226 227 //--------------------------------------------------------------------------- 228 @Override 229 public boolean equals(Object inObj2) 230 { 231 return (0 == compareTo(inObj2)); 232 } 233 234 //-------------------------------------------------------------------------- 235 public BioSequence setID(String inValue) 236 { 237 mID = inValue; 238 return this; 239 } 240 241 //-------------------------------------------------------------------------- 242 public String getID() 243 { 244 return mID; 245 } 246 247 //-------------------------------------------------------------------------- 248 public BioSequence setDescription(CharSequence inValue) 249 { 250 mDescription = (inValue != null ? inValue.toString() : null); 251 return this; 252 } 253 254 //-------------------------------------------------------------------------- 255 public String getDescription() 256 { 257 return mDescription; 258 } 259 260 //-------------------------------------------------------------------------- 261 public BioSequence setSequence(CharSequence inValue) 262 { 263 if (inValue != null) 264 { 265 setSequence(new StringReader(inValue.toString())); 266 } 267 else 268 { 269 mCompressedSeqChunks = null; 270 mUncompressedSequence = null; 271 clearElementalCompositionAndCalculatedProperties(); 272 } 273 274 return this; 275 } 276 277 //-------------------------------------------------------------------------- 278 public BioSequence setSequence(Reader inReader) 279 throws SeqIOException 280 { // Remove whitespace and numbers 281 Reader bufferedReader = new BufferedReader(new SeqFilterReader(inReader)); 282 283 boolean compressFlag = false; 284 int seqLength = 0; 285 StringBuilder uncompressedSeq = null; 286 try 287 { 288 char[] readBuffer = new char[1024]; 289 int readSize = 0; 290 while ((readSize = bufferedReader.read(readBuffer)) >= 0) 291 { 292 if (readSize > mCompressionThreshold) 293 { 294 mCompressedSeqChunks = new ArrayList<>(4); 295 compressFlag = true; 296 } 297 298 if (! compressFlag 299 && uncompressedSeq != null 300 && uncompressedSeq.length() + readSize > mCompressionThreshold) 301 { 302 // Compress what has already accumulated in the uncompressed seq 303 mCompressedSeqChunks = new ArrayList<>(4); 304 305 if (uncompressedSeq.length() > 0) 306 { 307 mCompressedSeqChunks.add(GZIP.compress(uncompressedSeq.toString())); 308 seqLength = uncompressedSeq.length(); 309 } 310 uncompressedSeq = null; 311 312 mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize))); 313 seqLength += readSize; 314 315 compressFlag = true; 316 // Use a bigger read buffer moving forward 317 readBuffer = new char[mCompressionThreshold]; 318 } 319 else if (compressFlag) 320 { 321 mCompressedSeqChunks.add(GZIP.compress(new String(readBuffer, 0, readSize))); 322 seqLength += readSize; 323 } 324 else 325 { 326 if (null == uncompressedSeq) 327 { 328 uncompressedSeq = new StringBuilder(readSize); 329 } 330 331 uncompressedSeq.append(readBuffer, 0, readSize); 332 } 333 } 334 } 335 catch (SeqIOException e) 336 { 337 throw e; 338 } 339 catch (Exception e) 340 { 341 throw new SeqIOException(e); 342 } 343 344 if (uncompressedSeq != null) 345 { 346 mUncompressedSequence = uncompressedSeq.toString(); 347 mSeqLength = mUncompressedSequence.length(); 348 } 349 else 350 { 351 mSeqLength = seqLength; 352 mUncompressedSequence = null; 353 } 354 355 // Calculate checksums TODO: Calculate on the stream as the sequence is set 356 357 clearElementalCompositionAndCalculatedProperties(); 358 return this; 359 } 360 361 //-------------------------------------------------------------------------- 362 public String getSequence() 363 { 364 String outSeq = null; 365 366 if (mUncompressedSequence != null) 367 { 368 outSeq = mUncompressedSequence; 369 } 370 else if (CollectionUtil.hasValues(mCompressedSeqChunks)) 371 { 372 try 373 { 374 outSeq = StreamUtil.inputStreamToString(getSequenceStream()); 375 } 376 catch (IOException e) 377 { 378 throw new RuntimeException(e); 379 } 380 } 381 382 return outSeq; 383 } 384 385 //-------------------------------------------------------------------------- 386 public Reader getSequenceReader() 387 { 388 Reader seqReader = null; 389 390 InputStream seqStream = getSequenceStream(); 391 if (seqStream != null) 392 { 393 seqReader = new InputStreamReader(seqStream); 394 } 395 396 return seqReader; 397 } 398 399 //-------------------------------------------------------------------------- 400 public Reader getSubSequenceReader(SeqLocation inSeqLocation) 401 { 402 SeqLocation seqLocation = inSeqLocation; 403 if (null == seqLocation) 404 { 405 seqLocation = new SeqLocation(1, length()); 406 } 407 408 return new SegmentReader(getSequenceReader(), seqLocation); 409 } 410 411 //-------------------------------------------------------------------------- 412 public InputStream getSequenceStream() 413 { 414 InputStream seqStream = null; 415 416 if (mUncompressedSequence != null) 417 { 418 seqStream = new ByteArrayInputStream(mUncompressedSequence.getBytes()); 419 } 420 else if (CollectionUtil.hasValues(mCompressedSeqChunks)) 421 { 422 seqStream = new SeqStreamer(Direction.FORWARD); 423 } 424 425 return seqStream; 426 } 427 428 //-------------------------------------------------------------------------- 429 public String getSubSequence(SeqLocation inSeqLocation) 430 { 431 String sequence = null; 432 433 Reader reader = null; 434 try 435 { 436 reader = getSubSequenceReader(inSeqLocation); 437 sequence = StreamUtil.readerToString(reader); 438 } 439 catch (IOException e) 440 { 441 throw new ProgrammingException(e); 442 } 443 finally 444 { 445 StreamUtil.close(reader); 446 } 447 448 return sequence; 449 } 450 451 //-------------------------------------------------------------------------- 452 public byte[] getMD5Checksum() 453 { 454 if (null == mMD5Checksum 455 && getSequence() != null) 456 { 457 mMD5Checksum = ChecksumUtil.calculateMD5(getSequence()); 458 } 459 460 return mMD5Checksum; 461 } 462 463 //-------------------------------------------------------------------------- 464 public byte[] getSHA1Checksum() 465 { 466 if (null == mSHA1Checksum 467 && getSequence() != null) 468 { 469 mSHA1Checksum = ChecksumUtil.calculateSHA1(getSequence()); 470 } 471 472 return mSHA1Checksum; 473 } 474 475 476 //-------------------------------------------------------------------------- 477 public boolean containsGaps() 478 { 479 return (getNumGaps() > 0); 480 } 481 482 //-------------------------------------------------------------------------- 483 public Integer getTotalGapLength() 484 { 485 if (null == mTotalGapLength) 486 { 487 getNumGaps(); 488 } 489 490 return mTotalGapLength; 491 } 492 493 //-------------------------------------------------------------------------- 494 protected void setNumGaps(Integer inValue) 495 { 496 mNumGaps = inValue; 497 } 498 499 //-------------------------------------------------------------------------- 500 protected void setTotalGapLength(Integer inValue) 501 { 502 mTotalGapLength = inValue; 503 } 504 505 //-------------------------------------------------------------------------- 506 public Integer getNumGaps() 507 { 508 if (null == mNumGaps) 509 { 510 countGaps(); 511 } 512 513 return mNumGaps; 514 } 515 516 //-------------------------------------------------------------------------- 517 protected void countGaps() 518 { 519 Matcher m = GAP_PATTERN.matcher(getSequence()); 520 int count = 0; 521 int totalGapLength = 0; 522 while (m.find()) 523 { 524 count++; 525 totalGapLength += m.group(0).length(); 526 } 527 528 mNumGaps = count; 529 mTotalGapLength = totalGapLength; 530 } 531 532 533 //-------------------------------------------------------------------------- 534 public int length() 535 { 536 return mSeqLength; 537 } 538 539 //-------------------------------------------------------------------------- 540 /** 541 Returns the residue character at the specified (1-based) sequence location. 542 @param inIndex the 1-based residue position 543 @return the residue at the specified position 544 */ 545 public char residueAt(int inIndex) 546 { 547 if (inIndex <= 0 548 || inIndex > length()) 549 { 550 throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!"); 551 } 552 553 Reader reader = getSequenceReader(); 554 try 555 { 556 reader.skip(inIndex - 1); 557 return (char) reader.read(); 558 } 559 catch (IOException e) 560 { 561 throw new RuntimeException(e); 562 } 563 } 564 565 //-------------------------------------------------------------------------- 566 /** 567 Specifies the residue to use at the specified (1-based) sequence location. 568 * @param inIndex the 1-based residue position 569 * @param inResidue the residue to place at the specified position 570 */ 571 public void setResidueAt(int inIndex, char inResidue) 572 { 573 if (inIndex <= 0 574 || inIndex > length()) 575 { 576 throw new IndexOutOfBoundsException(inIndex + " is out of bounds of the sequence's length (" + length() + ")!"); 577 } 578 579 StringBuilder buffer = new StringBuilder(getSequence()); 580 buffer.setCharAt(inIndex - 1, inResidue); 581 582 setSequence(buffer.toString()); 583 } 584 585 586 //-------------------------------------------------------------------------- 587 @Override 588 public XMLNode toXMLNode() 589 { 590 XMLNode node = new XMLTag(HfgBioXML.HFGBIOSEQ_TAG); 591 node.setAttribute(HfgBioXML.ID_ATT, getID()); 592 node.setAttribute(HfgBioXML.CLASS_ATT, getClass().getName()); 593 node.setContent(getSequence()); 594 if (StringUtil.isSet(getDescription())) 595 { 596 XMLNode descTag = new XMLTag(HfgBioXML.DESCRIPTION_TAG); 597 descTag.setContent(getDescription()); 598 node.addSubtag(descTag); 599 } 600 601 if (hasAttributes()) 602 { 603 XMLNode attributesTag = new XMLTag(HfgBioXML.ATTRIBUTES_TAG); 604 node.addSubtag(attributesTag); 605 606 for (String attrName : getAttributeNames()) 607 { 608 XMLNode attributeTag = new XMLTag(HfgBioXML.ATTRIBUTE_TAG); 609 attributeTag.setAttribute(HfgBioXML.NAME_ATT, attrName); 610 611 Object value = getAttribute(attrName); 612 if (value != null) 613 { 614 attributeTag.setAttribute(HfgBioXML.CLASS_ATT, value.getClass().getName()); 615 if (value instanceof XMLizable) 616 { 617 attributeTag.addContentWithoutEscaping(((XMLizable)value).toXML()); 618 } 619 else 620 { 621 attributeTag.setContent(value.toString()); 622 } 623 } 624 else 625 { 626 attributeTag.setAttribute(HfgBioXML.IS_NULL_ATT, 1); 627 } 628 629 attributesTag.addSubtag(attributeTag); 630 } 631 } 632 633 return node; 634 } 635 636 //-------------------------------------------------------------------------- 637 /** 638 Returns a Map with 1-letter code Strings as keys and Integers as the values. 639 If a residue isn't present in the sequence, there won't be an entry for it in the map. 640 @return Map with 1-letter code Strings as keys and Integers as the values 641 */ 642 protected Map<String, Integer> getComposition() 643 { 644 if (null == mComposition) 645 { 646 Map<String, int[]> map = new HashMap<>(); 647 648 if (length() < mCompressionThreshold) 649 { 650 // This should be more performant when the sequence is short 651 String seq = getSequence(); 652 for (int i = 0; i < seq.length(); i++) 653 { 654 char seqChar = seq.charAt(i); 655 int[] counter = map.get(seqChar + ""); 656 if (null == counter) 657 { 658 map.put(seqChar + "", new int[]{1}); 659 } 660 else 661 { 662 counter[0]++; 663 } 664 } 665 } 666 else 667 { 668 try 669 { 670 Reader seqReader = null; 671 try 672 { 673 seqReader = getSequenceReader(); 674 if (seqReader != null) 675 { 676 int seqChar; 677 while ((seqChar = seqReader.read()) != -1) 678 { 679 int[] counter = map.get((char) seqChar + ""); 680 if (null == counter) 681 { 682 map.put((char) seqChar + "", new int[]{1}); 683 } 684 else 685 { 686 counter[0]++; 687 } 688 } 689 } 690 } 691 finally 692 { 693 if (seqReader != null) seqReader.close(); 694 } 695 } 696 catch (IOException e) 697 { 698 throw new RuntimeException(e); 699 } 700 } 701 702 mComposition = new HashMap<>(); 703 704 for (String residue : map.keySet()) 705 { 706 mComposition.put(residue, map.get(residue)[0]); 707 } 708 } 709 710 return mComposition; 711 } 712 713 //-------------------------------------------------------------------------- 714 /** 715 Returns an unmodifiable copy of the elemental composition Map. The keys are 716 Element objects and the values are Floats. Why Floats instead of Integers you 717 ask? Because some amino acid codes such as B and Z are ambiguous averages. 718 @return the elemental composition map 719 */ 720 @Override 721 public Map<Element, Float> getElementalComposition() 722 { 723 Map<Element, Float> elementalComposition = super.getElementalComposition(); 724 if (! CollectionUtil.hasValues(elementalComposition)) 725 { 726 recalculateElementalComposition(); 727 elementalComposition = super.getElementalComposition(); 728 } 729 730 return elementalComposition; 731 } 732 733 //-------------------------------------------------------------------------- 734 /** 735 Returns a chemical formula String like 'C5H11NO'. If carbon is present, it 736 is listed first followed by the other elements in ascending mass order. 737 Symbols for pure elements are enclosed in square brackets such as '[2H]2O' 738 for deuterated water. 739 @return the un-formatted chemical formula string 740 */ 741 @Override 742 public String getChemicalFormula() 743 { 744 String chemicalFormula = super.getChemicalFormula(); 745 if (null == chemicalFormula) 746 { 747 getElementalComposition(); 748 chemicalFormula = super.getChemicalFormula(); 749 } 750 751 return chemicalFormula; 752 } 753 754 //-------------------------------------------------------------------------- 755 /** 756 Returns a chemical formula String like 'C₅H₁₁NO'. If carbon is present, it 757 is listed first followed by the other elements in ascending mass order. 758 Symbols for isotopes are enclosed in square brackets such as '[²H]₂O' 759 for deuterated water. 760 @return the chemical formula string 761 */ 762 @Override 763 public String getChemicalFormulaWithSubscripts() 764 { 765 String chemicalFormula = super.getChemicalFormulaWithSubscripts(); 766 if (null == chemicalFormula) 767 { 768 getElementalComposition(); 769 chemicalFormula = super.getChemicalFormulaWithSubscripts(); 770 } 771 772 return chemicalFormula; 773 } 774 775 //-------------------------------------------------------------------------- 776 @Override 777 public Double getMonoisotopicMass() 778 { 779 Double mass = super.getMonoisotopicMass(); 780 if (null == mass) 781 { 782 getElementalComposition(); 783 mass = super.getMonoisotopicMass(); 784 } 785 786 return mass; 787 } 788 789 //-------------------------------------------------------------------------- 790 @Override 791 public Double getAverageMass() 792 { 793 Double mass = super.getAverageMass(); 794 if (null == mass) 795 { 796 getElementalComposition(); 797 mass = super.getAverageMass(); 798 } 799 800 return mass; 801 } 802 803 //-------------------------------------------------------------------------- 804 /** 805 Organic mass values used are from: 806 <pre> 807 <i>Zhang Z, Pan H, Chen X. 2009. Mass spectrometry for structural characterization 808 of therapeutic antibodies. Mass Spectrom Rev 28:147-176.</i> 809 </pre> 810 @return the organic average mass for the sequence 811 */ 812 @Override 813 public Double getOrganicAverageMass() 814 { 815 Double mass = super.getOrganicAverageMass(); 816 if (null == mass) 817 { 818 getElementalComposition(); 819 mass = super.getOrganicAverageMass(); 820 } 821 822 return mass; 823 } 824 825 //-------------------------------------------------------------------------- 826 @Override 827 public void clearCalculatedProperties() 828 { 829 super.clearCalculatedProperties(); 830 831 mComposition = null; 832 mNumGaps = null; 833 mTotalGapLength = null; 834 } 835 836 //########################################################################## 837 // PROTECTED METHODS 838 //########################################################################## 839 840 841 //-------------------------------------------------------------------------- 842 // Setup this way to avoid a stackoverflow if clearElementalComposition() is called within clearCalculatedProperties(). 843 public void clearElementalCompositionAndCalculatedProperties() 844 { 845 super.clearElementalComposition(); 846 mComposition = null; 847 mNumGaps = null; 848 mTotalGapLength = null; 849 } 850 851 852 //-------------------------------------------------------------------------- 853 // Needs to be implemented by extending classes. 854 protected Map<Molecule, Integer> getResidueComposition() 855 { 856 throw new UnimplementedMethodException("getResidueComposition() not implemented for this class!"); 857 } 858 859 //-------------------------------------------------------------------------- 860 // Needs to be implemented by extending classes. 861 protected Map<Molecule, Integer> getTerminiComposition() 862 { 863 throw new UnimplementedMethodException("getTerminiComposition() not implemented for this class!"); 864 } 865 866 //-------------------------------------------------------------------------- 867 // Needs to be implemented by extending classes. 868 protected Map<? extends Molecule, Integer> getXLinkComposition() 869 { 870 throw new UnimplementedMethodException("getXLinkComposition() not implemented for this class!"); 871 } 872 873 //-------------------------------------------------------------------------- 874 protected void recalculateElementalComposition() 875 { 876 clearElementalComposition(); 877 878 // Residues 879 Map<Molecule, Integer> residueComposition = getResidueComposition(); 880 if (CollectionUtil.hasValues(residueComposition)) 881 { 882 for (Molecule residue : residueComposition.keySet()) 883 { 884 int count = residueComposition.get(residue); 885 if (count > 0) 886 { 887 addElementalComposition(residue.getElementalComposition(), count); 888 } 889 } 890 } 891 892 // Termini 893 Map<Molecule, Integer> terminiComposition = getTerminiComposition(); 894 if (CollectionUtil.hasValues(terminiComposition)) 895 { 896 for (Molecule terminus : terminiComposition.keySet()) 897 { 898 int count = terminiComposition.get(terminus); 899 if (count > 0) 900 { 901 addElementalComposition(terminus.getElementalComposition(), count); 902 } 903 } 904 } 905 906 // X-Links? 907 Map<? extends Molecule, Integer> xLinkComposition = getXLinkComposition(); 908 if (CollectionUtil.hasValues(xLinkComposition)) 909 { 910 for (Molecule xlink : xLinkComposition.keySet()) 911 { 912 int count = xLinkComposition.get(xlink); 913 if (count > 0) 914 { 915 addElementalComposition(xlink.getElementalComposition(), count); 916 } 917 } 918 } 919 } 920 921 922 //-------------------------------------------------------------------------- 923 protected InputStream getReverseSequenceStream() 924 { 925 InputStream seqStream = null; 926 927 if (mUncompressedSequence != null) 928 { 929 seqStream = new ByteArrayInputStream(new StringBuilder(mUncompressedSequence).reverse().toString().getBytes()); 930 } 931 else if (CollectionUtil.hasValues(mCompressedSeqChunks)) 932 { 933 seqStream = new SeqStreamer(Direction.REVERSE); 934 } 935 936 return seqStream; 937 } 938 939 //########################################################################## 940 // INNER CLASSES 941 //########################################################################## 942 943 944 private enum Direction 945 { 946 FORWARD, REVERSE; 947 } 948 949 private class SeqStreamer extends InputStream 950 { 951 private Direction mDirection; 952 private String mCurrentChunk; 953 private int mCurrentChunkIndex; 954 private int mCharIndex; 955 private boolean mDone = false; 956 957 //----------------------------------------------------------------------- 958 public SeqStreamer(Direction inDirection) 959 { 960 mDirection = inDirection; 961 mCurrentChunkIndex = (mDirection == Direction.FORWARD ? 0 : mCompressedSeqChunks.size() - 1); 962 } 963 964 //----------------------------------------------------------------------- 965 public int read() 966 { 967 return (mDone ? -1 : getNextChar()); 968 } 969 970 //----------------------------------------------------------------------- 971 private char getNextChar() 972 { 973 if (null == mCurrentChunk) 974 { 975 if (mDirection == Direction.FORWARD) 976 { 977 mCurrentChunk = GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex)); 978 } 979 else 980 { 981 mCurrentChunk = new StringBuilder(GZIP.uncompressToString(mCompressedSeqChunks.get(mCurrentChunkIndex))).reverse().toString(); 982 983 } 984 985 mCharIndex = 0; 986 } 987 988 char nextChar = mCurrentChunk.charAt(mCharIndex++); 989 990 if (mCharIndex >= mCurrentChunk.length()) 991 { 992 // This is the last char in this chunk. 993 mCurrentChunk = null; 994 mCurrentChunkIndex = mCurrentChunkIndex + (mDirection == Direction.FORWARD ? 1 : - 1); 995 if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedSeqChunks.size()) 996 { 997 // This was the last chunk. 998 mDone = true; 999 } 1000 } 1001 1002 return nextChar; 1003 } 1004 } 1005 1006 1007 public class SeqFilterReader extends FilterReader 1008 { 1009 private char[] mBuffer = new char[8196]; 1010 private int mBufferLimit; 1011 private int mBufferIndex; 1012 private boolean mEndOfStreamReached; 1013 1014 //--------------------------------------------------------------------------- 1015 public SeqFilterReader(Reader inReader) 1016 { 1017 super(inReader); 1018 } 1019 1020 //--------------------------------------------------------------------------- 1021 public int read() 1022 throws IOException 1023 { 1024 int returnChar; 1025/* 1026 do 1027 { 1028 if (mBufferIndex >= mBufferLimit) 1029 { 1030 fillBuffer(); 1031 } 1032 1033 returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); 1034 } 1035 while (returnChar >= 0 1036 && returnChar != '*' 1037 && returnChar != '-' 1038 && (returnChar < 65 || returnChar > 90) // Not upper case letter 1039 && (returnChar < 97 || returnChar > 122) // Not lower case letter 1040 ); 1041*/ 1042 while (true) 1043 { 1044 if (mBufferIndex >= mBufferLimit) 1045 { 1046 fillBuffer(); 1047 } 1048 1049 returnChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); 1050 1051 if ((returnChar > 64 && returnChar < 91) // Upper-case letter 1052 || (returnChar > 96 && returnChar < 123) // Lower-case letter 1053 || returnChar == '-' // Gap 1054 || returnChar == '*' // Stop codon 1055 || returnChar < 0) // End of stream 1056 { 1057 break; 1058 } 1059 }; 1060 1061 return returnChar; 1062 } 1063 1064 //--------------------------------------------------------------------------- 1065 @Override 1066 public int read(char[] inBuffer, int inOffset, int inMaxReadLength) 1067 throws IOException 1068 { 1069 int theChar; 1070 int numCharsRead = 0; 1071 do 1072 { 1073 while (true) 1074 { 1075 if (mBufferIndex >= mBufferLimit) 1076 { 1077 fillBuffer(); 1078 } 1079 1080 theChar = (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); 1081 1082 if ((theChar > 64 && theChar < 91) // Upper-case letter 1083 || (theChar > 96 && theChar < 123) // Lower-case letter 1084 || theChar == '-' // Gap 1085 || theChar == '*' // Stop codon 1086 || theChar < 0) // End of stream 1087 { 1088 break; 1089 } 1090 }; 1091 1092 if (theChar > 0) 1093 { 1094 inBuffer[inOffset++] = (char) theChar; 1095 numCharsRead++; 1096 } 1097 } 1098 while (theChar >= 0 1099 && numCharsRead < inMaxReadLength); 1100 1101 return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead); 1102 } 1103 1104 //--------------------------------------------------------------------------- 1105 private void fillBuffer() 1106 throws IOException 1107 { 1108 mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length); 1109 1110 if (-1 == mBufferLimit) 1111 { 1112 mEndOfStreamReached = true; 1113 } 1114 1115 // Reset the index 1116 mBufferIndex = 0; 1117 } 1118 } 1119 1120}