001package com.hfg.xml.parser; 002 003import java.util.*; 004import java.io.InputStream; 005import java.io.IOException; 006import java.io.InputStreamReader; 007import java.io.BufferedReader; 008import javax.xml.parsers.SAXParser; 009 010import org.xml.sax.*; 011import org.xml.sax.ext.LexicalHandler; 012import org.xml.sax.helpers.AttributesImpl; 013 014import com.hfg.util.BooleanUtil; 015import com.hfg.xml.Doctype; 016import com.hfg.xml.XMLNamespace; 017 018//------------------------------------------------------------------------------ 019/** 020 SaxyParser is a lightweight SAX parser. 021 <div> 022 Important (for me) differences between SaxyParser and Xerces: 023 <ul> 024 <li>SaxyParser does NOT close InputSources upon the completion of parsing.</li> 025 <li>SaxyParser can handle the interleaving of content and subtags.</li> 026 </ul> 027 </div> 028 @author J. Alex Taylor, hairyfatguy.com 029 */ 030//------------------------------------------------------------------------------ 031// com.hfg XML/HTML Coding Library 032// 033// This library is free software; you can redistribute it and/or 034// modify it under the terms of the GNU Lesser General Public 035// License as published by the Free Software Foundation; either 036// version 2.1 of the License, or (at your option) any later version. 037// 038// This library is distributed in the hope that it will be useful, 039// but WITHOUT ANY WARRANTY; without even the implied warranty of 040// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 041// Lesser General Public License for more details. 042// 043// You should have received a copy of the GNU Lesser General Public 044// License along with this library; if not, write to the Free Software 045// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 046// 047// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 048// jataylor@hairyfatguy.com 049//------------------------------------------------------------------------------ 050 051public class SaxyParser extends SAXParser implements XMLReader 052{ 053 //########################################################################### 054 // PUBLIC FIELDS 055 //########################################################################### 056 057 /** 058 Parser property to enable entity expansion. 059 <pre> 060 http://hairyfatguy.com/sax/properties/entity-expansion 061 </pre> 062 False by default. 063 */ 064 public static final String ENTITY_EXPANSION_PROPERTY = "http://hairyfatguy.com/sax/properties/entity-expansion"; 065 066 067 /** 068 Parser property to enable strict parsing mode. 069 <pre> 070 http://hairyfatguy.com/sax/properties/strict 071 </pre> 072 False by default. 073 */ 074 public static final String STRICT_PROPERTY = "http://hairyfatguy.com/sax/properties/strict"; 075 076 //########################################################################### 077 // PRIVATE FIELDS 078 //########################################################################### 079 080 private Doctype mDoctype; 081 082 private ContentHandler mContentHandler; 083 private LexicalHandler mLexicalHandler; 084 private DTDHandler mDTDHandler; 085 private ErrorHandler mErrHandler; 086 private EntityResolver mEntityResolver; 087 088 private boolean debug = false; 089 private boolean mExpandEntities = false; 090 private boolean mStrict = false; 091 092 private BufferedReader mBufferedReader; 093 private int mLineCount; 094 private int mColumn; 095 private boolean mEOL; 096 097 private char[] mTempBuffer = new char[100]; 098 099 private int mState; 100 private Stack mTagStack; 101 private Stack mStateStack; 102 private int mDepth; 103 private Map mNamespaceMap; 104 private Stack<Namespace> mDefaultNamespaceStack; 105 106 private boolean mLenientHTMLParsing; 107 private Collection<String> mLenientHTMLEmptyTags = sLenientHTMLEmptyTags; 108 109 // Enumerated parser states 110 private static final int INITIAL = 0; 111 private static final int IN_START_OF_TAG = 1; 112 private static final int IN_XML_DECLARATION = 2; 113 private static final int IN_DOCTYPE = 3; 114 private static final int IN_START_TAG = 4; 115 private static final int IN_END_TAG = 5; 116 private static final int IN_CONTENT = 6; 117 private static final int IN_ENTITY = 7; 118 private static final int IN_CDATA = 8; 119 private static final int IN_COMMENT = 9; 120 private static final int IN_EMPTY_TAG = 10; 121 private static final int IN_MIDDLE_OF_START_TAG = 11; 122 private static final int IN_ATT_NAME = 12; 123 private static final int IN_ATT_VALUE = 13; 124 private static final int DONE = 14; 125 126 private static final int TAG_START = '<'; 127 private static final int TAG_END = '>'; 128 private static final int ENTITY_START = '&'; 129 private static int MAX_ENTITY_LENGTH = 12; 130 131 private static String XMLNS = "xmlns".intern(); 132 133 private static final String LEXICAL_HANDLER_PROPERTY = "http://xml.org/sax/properties/lexical-handler"; 134 135 136 private static Set<String> sLenientHTMLEmptyTags; 137 private static Set<String> sLenientHTMLTags; 138 139 static 140 { 141 sLenientHTMLEmptyTags = new HashSet<>(10); 142 sLenientHTMLEmptyTags.add("br"); 143 sLenientHTMLEmptyTags.add("img"); 144 sLenientHTMLEmptyTags.add("link"); 145 sLenientHTMLEmptyTags.add("meta"); 146 147 sLenientHTMLTags = new HashSet<>(10); 148 sLenientHTMLTags.add("p"); 149 } 150 151 //########################################################################### 152 // CONSTRUCTORS 153 //########################################################################### 154 155 //--------------------------------------------------------------------------- 156 public SaxyParser() 157 { 158 super(); 159 } 160 161 //########################################################################### 162 // PUBLIC METHODS 163 //########################################################################### 164 165 //---------------------------------------------------------------------- 166 public SaxyParser setLenientHTMLParsing(boolean inValue) 167 { 168 mLenientHTMLParsing = inValue; 169 return this; 170 } 171 172 //---------------------------------------------------------------------- 173 public SaxyParser setLenientHTMLEmptyTags(Collection<String> inEmptyTags) 174 { 175 mLenientHTMLEmptyTags = inEmptyTags; 176 return this; 177 } 178 179 //--------------------------------------------------------------------------- 180 /** 181 After parsing, the Doctype (if one was set) can be retrieved here. 182 */ 183 public Doctype getDoctype() 184 { 185 return mDoctype; 186 } 187 188 // METHODS REQUIRED TO EXTEND SaxParser 189 190 //--------------------------------------------------------------------------- 191 @SuppressWarnings("deprecation") 192 public Parser getParser() 193 throws SAXException 194 { 195 throw new SAXException("getParser() not supported. User getXMLReader()."); 196 } 197 198 //--------------------------------------------------------------------------- 199 public XMLReader getXMLReader() 200 { 201 return this; 202 } 203 204 //--------------------------------------------------------------------------- 205 public boolean isNamespaceAware() 206 { 207 return true; 208 } 209 210 //--------------------------------------------------------------------------- 211 public boolean isValidating() 212 { 213 return false; 214 } 215 216 //--------------------------------------------------------------------------- 217 public void setProperty(String inName, Object inValue) 218 throws SAXNotRecognizedException 219 { 220 if (inName.equals(LEXICAL_HANDLER_PROPERTY)) 221 { 222 setLexicalHandler((LexicalHandler)inValue); 223 } 224 else if (inName.equals(ENTITY_EXPANSION_PROPERTY)) 225 { 226 mExpandEntities = BooleanUtil.valueOf(inValue); 227 } 228 else if (inName.equals(STRICT_PROPERTY)) 229 { 230 mStrict = BooleanUtil.valueOf(inValue); 231 } 232 else 233 { 234 throw new SAXNotRecognizedException("Property '" + inName + "' not supported."); 235 } 236 } 237 238 //--------------------------------------------------------------------------- 239 public Object getProperty(String inName) 240 throws SAXNotRecognizedException 241 { 242 Object obj; 243 244 if (inName.equals(LEXICAL_HANDLER_PROPERTY)) 245 { 246 obj = getLexicalHandler(); 247 } 248 else if (inName.equals(ENTITY_EXPANSION_PROPERTY)) 249 { 250 obj = new Boolean(mExpandEntities); 251 } 252 else if (inName.equals(STRICT_PROPERTY)) 253 { 254 obj = new Boolean(mStrict); 255 } 256 else 257 { 258 throw new SAXNotRecognizedException("Property '" + inName + "' not supported."); 259 } 260 261 return obj; 262 } 263 264 // METHODS REQUIRED BY THE XMLReader interface 265 266 //--------------------------------------------------------------------------- 267 public ContentHandler getContentHandler() 268 { 269 return mContentHandler; 270 } 271 272 //--------------------------------------------------------------------------- 273 public void setContentHandler(ContentHandler handler) 274 { 275 mContentHandler = handler; 276 } 277 278 //--------------------------------------------------------------------------- 279 public DTDHandler getDTDHandler() 280 { 281 return mDTDHandler; 282 } 283 284 //--------------------------------------------------------------------------- 285 public void setDTDHandler(DTDHandler handler) 286 { 287 mDTDHandler = handler; 288 } 289 290 //--------------------------------------------------------------------------- 291 public ErrorHandler getErrorHandler() 292 { 293 return mErrHandler; 294 } 295 296 //--------------------------------------------------------------------------- 297 public void setErrorHandler(ErrorHandler handler) 298 { 299 mErrHandler = handler; 300 } 301 302 //--------------------------------------------------------------------------- 303 public EntityResolver getEntityResolver() 304 { 305 return mEntityResolver; 306 } 307 308 //--------------------------------------------------------------------------- 309 public void setEntityResolver(EntityResolver resolver) 310 { 311 mEntityResolver = resolver; 312 } 313 314 //--------------------------------------------------------------------------- 315 public void setFeature(String inName, boolean inValue) 316 throws SAXNotRecognizedException 317 { 318 if (inName.equals("http://xml.org/sax/features/namespaces")) 319 { 320 // Namespaces is always on. 321 } 322 else if (inName.equals("http://xml.org/sax/features/namespace-prefixes")) 323 { 324 // Always set to false. 325 } 326 else 327 { 328 throw new SAXNotRecognizedException("Feature '" + inName + "' not supported."); 329 } 330 } 331 332 //--------------------------------------------------------------------------- 333 public boolean getFeature(String inName) 334 throws SAXNotRecognizedException 335 { 336 boolean outValue = false; 337 338 if (inName.equals("http://xml.org/sax/features/namespaces")) 339 { 340 outValue = true; 341 } 342 else if (inName.equals("http://xml.org/sax/features/namespace-prefixes")) 343 { 344 outValue = false; 345 } 346 else 347 { 348 throw new SAXNotRecognizedException("Feature '" + inName + "' not supported."); 349 } 350 351 return outValue; 352 } 353/* 354 //--------------------------------------------------------------------------- 355 public void setLocale(Locale locale) 356 throws SAXException 357 { 358 throw new SAXException("setLocale() is not supported by this parser."); 359 } 360*/ 361 362 363 //--------------------------------------------------------------------------- 364 public LexicalHandler getLexicalHandler() 365 { 366 return mLexicalHandler; 367 } 368 369 //--------------------------------------------------------------------------- 370 public void setLexicalHandler(LexicalHandler handler) 371 { 372 mLexicalHandler = handler; 373 } 374 375 376 //--------------------------------------------------------------------------- 377 public void parse(String inSystemId) 378 throws SAXException, IOException 379 { 380 parse(new InputSource(inSystemId)); 381 } 382 383 //--------------------------------------------------------------------------- 384 public XMLNamespace getCurrentDefaultNamespace() 385 { 386 return (mDefaultNamespaceStack.size() > 0 ? XMLNamespace.getNamespace(mDefaultNamespaceStack.peek().getURI()) : null); 387 } 388 389 //--------------------------------------------------------------------------- 390 private int nextChar() 391 throws IOException 392 { 393 int c = mBufferedReader.read(); 394 if (c == -1) throw new EOFException(); 395 396 // Need to map \r, \r\n, and \n to \n 397 // See XML spec section 2.11 398 if (c <= '\n') 399 { 400 if (c == '\n' && mEOL) 401 { 402 mEOL = false; 403 c = nextChar(); 404 } 405 else if (mEOL) 406 { 407 mEOL = false; 408 } 409 else if (c == '\n') 410 { 411 mLineCount++; 412 mColumn = 0; 413 } 414 else if (c == '\r') 415 { 416 mEOL = true; 417 c = '\n'; 418 mLineCount++; 419 mColumn = 0; 420 } 421 else 422 { 423 mColumn++; 424 } 425 } 426 else 427 { 428 mColumn++; 429 } 430 431 return c; 432 } 433 434 //--------------------------------------------------------------------------- 435 private void skipEndTag(int c) 436 throws SAXException, IOException 437 { 438 String openTag; 439 440 try 441 { 442 openTag = (String) mTagStack.peek(); 443 } 444 catch (EmptyStackException e) 445 { 446 StringBuffer extraEndTag = new StringBuffer(c); 447 while ((c = nextChar()) != TAG_END) 448 { 449 extraEndTag.append(c); 450 } 451 throw new SAXException("End tag '" + extraEndTag + "' didn't have a matching start tag!"); 452 } 453 454 mTempBuffer[0] = (char)c; 455 int bytesToRead = openTag.length(); 456 int bytesRead = mBufferedReader.read(mTempBuffer, 1, bytesToRead); 457 // Calls to read() are not guaranteed to read the full requested length. 458 // Allow for partial reads. 459 while (bytesRead < bytesToRead) 460 { 461 if (bytesRead <= 0) 462 { 463 throw new SAXException("Expected end tag '" + openTag + "' but reached the end of the file."); 464 } 465 466 bytesRead += mBufferedReader.read(mTempBuffer, bytesRead + 1, bytesToRead - bytesRead); 467 } 468 469 mColumn += bytesToRead; 470 471 if (! openTag.equals(new String(mTempBuffer, 0, openTag.length()))) 472 { 473 if (! mLenientHTMLParsing || ! sLenientHTMLTags.contains(openTag.toLowerCase())) 474 { 475 throw new SAXException("Expected end tag '" + openTag + "' but found '" 476 + (new String(mTempBuffer, 0, openTag.length())) + "' instead!" 477 + " Line: " + mLineCount + " Col: " + mColumn); 478 } 479 } 480 481 if (mTempBuffer[bytesToRead] != TAG_END) 482 { 483 throw new SAXException("Problem with end tag '" + mTempBuffer[bytesToRead] 484 + "'! Line: " + mLineCount + " Col: " + mColumn); 485 } 486 } 487 488 //--------------------------------------------------------------------------- 489 public void parse(InputSource source) 490 throws SAXException, IOException 491 { 492 initNamespaceStructures(); 493 494 mStateStack = new Stack(); 495 mTagStack = new Stack(); 496 mDepth = 0; 497 XMLStringBuffer buffer = new XMLStringBuffer(); 498 XMLStringBuffer entity = new XMLStringBuffer(); 499 String tagName = null; 500 String tagURI = null; 501 String tagLocalName = null; 502 AttributesImpl attributes = new AttributesImpl(); 503 String attName = null; 504 mLineCount = 1; 505 mColumn = 0; 506 mEOL = false; 507 508// int quoteChar = '"'; 509 int quoteChar = -1; 510 511 // I'm not sure of any other way that will work to even out the differences between character and binary sources. 512 try 513 { 514 mBufferedReader = new BufferedReader(source.getCharacterStream(), 8192); 515 } 516 catch (NullPointerException e) 517 { 518 mBufferedReader = new BufferedReader(new InputStreamReader(source.getByteStream()), 8192); 519 } 520 521 mContentHandler.startDocument(); 522 523 mState = INITIAL; 524 pushState(); 525 526 try 527 { 528 int c; 529 while (mState != DONE) 530 { 531 c = nextChar(); 532 533 switch (mState) 534 { 535 case IN_CONTENT: 536 while (true) 537 { 538 // We are in tag content 539 if (c == TAG_START) 540 { 541 pushState(); 542 mState = IN_START_OF_TAG; 543 if (buffer.length() > 0) 544 { 545 if (buffer.hasNonwhitespaceContent()) 546 { 547 mContentHandler.characters(buffer.getCharArray(), 0, buffer.length()); 548 } 549 buffer.clear(); 550 } 551 break; 552 } 553 else if (c == ENTITY_START) 554 { 555 pushState(); 556 mState = IN_ENTITY; 557 entity.clear(); 558 break; 559 } 560 else 561 { 562 buffer.append((char) c); 563 } 564 565 c = nextChar(); 566 } 567 break; 568 569 570 571 case INITIAL: 572 while (true) 573 { 574 if (c == -1) 575 { 576 mState = DONE; 577 break; 578 } 579 else if (c == TAG_START) 580 { 581 pushState(); 582 mState = IN_START_OF_TAG; 583 break; 584 } 585 else if (!Character.isWhitespace((char) c)) 586 { 587 throw new SAXException("Unexpected text outside of tags (" + (char) c + ")!" 588 + " Line: " + mLineCount + " Col: " + mColumn); 589 } 590 591 c = nextChar(); 592 } 593 break; 594 595 596 597 case IN_START_OF_TAG: 598 // What type of tag does it look like we've wandered into? 599 if (c == '/') 600 { 601 popState(); 602 mState = IN_END_TAG; 603 tagName = null; 604 } 605 else if (c == '?') 606 { 607 mState = IN_XML_DECLARATION; 608 } 609 else 610 { 611 mState = IN_START_TAG; 612 tagName = null; 613 attributes.clear(); 614 buffer.append((char) c); 615 } 616 617 break; 618 619 620 case IN_START_TAG: 621 while (true) 622 { 623 624 if (Character.isWhitespace((char) c)) 625 { 626 tagName = buffer.toString(); 627 pushDepth(tagName); 628 buffer.clear(); 629 mState = IN_MIDDLE_OF_START_TAG; 630 break; 631 } 632 else if (c == TAG_END) 633 { 634 if (null == tagName) tagName = buffer.toString(); 635 pushDepth(tagName); 636 tagURI = getNamespaceURI(getNamespacePrefix(tagName)); 637 tagLocalName = getLocalName(tagName); 638 mContentHandler.startElement(tagURI, 639 tagLocalName, 640 tagName, 641 attributes); 642 643 if (mLenientHTMLEmptyTags != null 644 && mLenientHTMLEmptyTags.contains(tagLocalName.toLowerCase())) 645 { 646 mContentHandler.endElement(tagURI, 647 tagLocalName, 648 tagName); 649 mTagStack.pop(); 650 buffer.clear(); 651 popDepth(); 652 popState(); 653 654 mState = INITIAL; 655 } 656 else 657 { 658 mState = IN_CONTENT; 659 } 660 661 buffer.clear(); 662 break; 663 } 664 else if (c == '/') 665 { 666 if (null == tagName) tagName = buffer.toString(); 667 pushDepth(tagName); 668 mState = IN_EMPTY_TAG; 669 break; 670 } 671 else if (c == '-' 672 && buffer.toString().equals("!-")) 673 { 674 mState = IN_COMMENT; 675 break; 676 } 677 else if (c == '[' 678 && buffer.toString().equals("![CDATA")) 679 { 680 mState = IN_CDATA; 681 if (mLexicalHandler != null) mLexicalHandler.startCDATA(); 682 buffer.clear(); 683 break; 684 } 685 else if (c == 'E' 686 && buffer.toString().equals("!DOCTYP")) 687 { 688 mState = IN_DOCTYPE; 689// buffer.clear(); 690 buffer.prepend("<"); 691 buffer.append((char) c); 692 break; 693 } 694 else 695 { 696 buffer.append((char) c); 697 } 698 699 c = nextChar(); 700 } 701 break; 702 703 704 705 case IN_MIDDLE_OF_START_TAG: 706 while (true) 707 { 708 if (c == TAG_END) 709 { 710 tagURI = getNamespaceURI(getNamespacePrefix(tagName)); 711 tagLocalName = getLocalName(tagName); 712 mContentHandler.startElement(tagURI, 713 tagLocalName, 714 tagName, 715 attributes); 716 717 if (mLenientHTMLEmptyTags != null 718 && mLenientHTMLEmptyTags.contains(tagLocalName.toLowerCase())) 719 { 720 mContentHandler.endElement(tagURI, 721 tagLocalName, 722 tagName); 723 mTagStack.pop(); 724 popDepth(); 725 popState(); 726 727 mState = INITIAL; 728 } 729 else 730 { 731 mState = IN_CONTENT; 732 } 733 734 buffer.clear(); 735 break; 736 } 737 else if (c == '/') 738 { 739 mState = IN_EMPTY_TAG; 740 break; 741 } 742 else if (!Character.isWhitespace((char) c)) 743 { 744 mState = IN_ATT_NAME; 745 attName = null; 746 // buffer.clear(); 747 buffer.append((char) c); 748 break; 749 } 750 751 c = nextChar(); 752 } 753 break; 754 755 756 757 case IN_ATT_NAME: 758 while (true) 759 { 760 if (c == '=') 761 { 762 attName = buffer.toString(); 763 mState = IN_ATT_VALUE; 764 buffer.clear(); 765 break; 766 } 767 else 768 { 769 buffer.append((char) c); 770 } 771 772 c = nextChar(); 773 } 774 break; 775 776 777 case IN_ATT_VALUE: 778 if (quoteChar == -1) 779 { 780 if (c == '"' || c == '\'') 781 { 782 quoteChar = c; 783 c = nextChar(); 784 } 785 else if (! mLenientHTMLParsing) 786 { 787 throw new SAXException("Improper attribute construction. Expected a quote character!" + mLineCount); 788 } 789 } 790 791 while (true) 792 { 793 if (c == quoteChar 794 || (quoteChar == -1 && "> \t\r\n".contains(((char)c) + ""))) 795 { 796 String attValue = buffer.toString(); 797 //debugMsg("attName: '" + attName + "'"); 798 if (attName.startsWith("xmlns:") 799 || attName.equals(XMLNS)) 800 { 801 extractNamespaceDeclaration(attName, attValue); 802 } 803 attributes.addAttribute("", attName, attName, "CDATA", attValue); 804 805 buffer.clear(); 806 quoteChar = -1; 807 808 if (c == '>') 809 { 810 tagURI = getNamespaceURI(getNamespacePrefix(tagName)); 811 tagLocalName = getLocalName(tagName); 812 mContentHandler.startElement(tagURI, 813 tagLocalName, 814 tagName, 815 attributes); 816 817 mState = IN_CONTENT; 818 } 819 else 820 { 821 mState = IN_MIDDLE_OF_START_TAG; 822 } 823 824 break; 825 } 826 else if (c == ENTITY_START) 827 { 828 pushState(); 829 mState = IN_ENTITY; 830 entity.clear(); 831 break; 832 } 833 else if (" \r\n\u0009".indexOf(c) >= 0) 834 { 835 buffer.append(' '); 836 } 837 else 838 { 839 buffer.append((char) c); 840 } 841 842 c = nextChar(); 843 } 844 break; 845 846 847 case IN_EMPTY_TAG: 848 if (c != TAG_END) 849 { 850 throw new SAXException("Expected '>' for tag <" + tagName + "/>" 851 + " Line: " + mLineCount + " Col: " + mColumn); 852 } 853 tagURI = getNamespaceURI(getNamespacePrefix(tagName)); 854 tagLocalName = getLocalName(tagName); 855 mContentHandler.startElement(tagURI, 856 tagLocalName, 857 tagName, 858 attributes); 859 mContentHandler.endElement(tagURI, 860 tagLocalName, 861 tagName); 862 mTagStack.pop(); 863 buffer.clear(); 864 popDepth(); 865 popState(); 866 break; 867 868 869 case IN_END_TAG: 870 // XXXXXXXXXXXXX The ending tag better be the one currently open. 871 // (Otherwise it's an error.) Try to take advantage of this precognition. 872 //String openTag = (String) mTagStack.pop(); 873 skipEndTag(c); 874 tagName = (String) mTagStack.pop(); 875 mContentHandler.endElement(tagURI, 876 getLocalName(tagName), 877 tagName); 878 buffer.clear(); 879 popDepth(); 880 popState(); 881 882/* 883 while (true) 884 { 885 // We are in an end tag 886 if (c == TAG_END) 887 { 888 tagName = buffer.toString(); 889 if (mDepth == 0) 890 { 891 throw new SAXException("Closing tag '</" + tagName + ">' " 892 + "had no opening tag!" 893 + " Line: " + mLineCount + " Col: " + mColumn); 894 } 895 mContentHandler.endElement(tagURI, 896 tagLocalName, 897 tagName); 898 buffer.clear(); 899 popDepth(tagName); 900 popState(); 901 break; 902 } 903 else 904 { 905 buffer.append((char) c); 906 } 907 908 c = nextChar(); 909 } 910*/ 911 break; 912 913 914 case IN_ENTITY: 915 boolean recoverFromInvalidEntity = false; 916 917 while (true) 918 { 919 if (c == ';') 920 { 921 String expandedEntity = expandEntity(entity.toString()); 922 buffer.append((expandedEntity != null ? expandedEntity : "&" + entity.toString() + ";")); 923 entity.clear(); 924 popState(); 925 break; 926 } 927 else if (Character.isWhitespace((char) c)) 928 { 929 if (mStrict) 930 { 931 throw new SAXException("Character entity contains whitespace !?" 932 + " Line: " + mLineCount + " Col: " + mColumn); 933 } 934 else 935 { 936 recoverFromInvalidEntity = true; 937 break; 938 } 939 } 940 else if (c == '"' || c == '\'' || c == '>') 941 { 942 if (mStrict) 943 { 944 throw new SAXException("Character entity contains illegal character !?" 945 + " Line: " + mLineCount + " Col: " + mColumn); 946 } 947 else 948 { 949 recoverFromInvalidEntity = true; 950 break; 951 } 952 } 953 else if (entity.length() >= MAX_ENTITY_LENGTH) 954 { 955 // Either the entity was malformed or the ampersand should have been escaped. 956 if (mStrict) 957 { 958 throw new SAXException("Max entity length (" + MAX_ENTITY_LENGTH + ") exceeded!?" 959 + " Line: " + mLineCount + " Col: " + mColumn); 960 } 961 else 962 { 963 recoverFromInvalidEntity = true; 964 break; 965 } 966 } 967 else 968 { 969 entity.append((char) c); 970 } 971 972 c = nextChar(); 973 } 974 975 if (recoverFromInvalidEntity) 976 { 977 entity.append((char) c); 978 buffer.append("&" + entity.toString()); 979 entity.clear(); 980 popState(); 981 982 if (IN_ATT_VALUE == mState) 983 { 984 if (c == quoteChar) 985 { 986 String attValue = buffer.toString(); 987 //debugMsg("attName: '" + attName + "'"); 988 if (attName.startsWith("xmlns:") 989 || attName.equals(XMLNS)) 990 { 991 extractNamespaceDeclaration(attName, attValue); 992 } 993 attributes.addAttribute("", attName, attName, "CDATA", attValue); 994 995 buffer.clear(); 996 quoteChar = -1; 997 mState = IN_MIDDLE_OF_START_TAG; 998 break; 999 } 1000 } 1001 else if (IN_CONTENT == mState) 1002 { 1003 if (c == TAG_START) 1004 { 1005 pushState(); 1006 mState = IN_START_OF_TAG; 1007 if (buffer.length() > 0) 1008 { 1009 if (buffer.hasNonwhitespaceContent()) 1010 { 1011 mContentHandler.characters(buffer.getCharArray(), 0, buffer.length()); 1012 } 1013 buffer.clear(); 1014 } 1015 } 1016 } 1017 } 1018 1019 break; 1020 1021 1022 case IN_CDATA: 1023 while (true) 1024 { 1025 if (c == TAG_END 1026 && buffer.toString().endsWith("]]")) 1027 { 1028 buffer.setLength(buffer.length() - 2); 1029 mContentHandler.characters(buffer.getCharArray(), 0, buffer.length()); 1030 if (mLexicalHandler != null) mLexicalHandler.endCDATA(); 1031 popState(); 1032 buffer.clear(); 1033 break; 1034 } 1035 else 1036 { 1037 buffer.append((char) c); 1038 } 1039 1040 c = nextChar(); 1041 } 1042 break; 1043 1044 1045 1046 case IN_COMMENT: 1047 while (true) 1048 { 1049 // Inside a comment: <!-- --> 1050 if (c == TAG_END 1051 && buffer.toString().endsWith("--")) 1052 { 1053 if (mLexicalHandler != null) mLexicalHandler.comment(buffer.getCharArray(), 2, buffer.length() - 4); 1054 popState(); 1055 buffer.clear(); 1056 break; 1057 } 1058 else 1059 { 1060 buffer.append((char) c); 1061 } 1062 1063 c = nextChar(); 1064 } 1065 break; 1066 1067 1068 case IN_XML_DECLARATION: 1069 while (true) 1070 { 1071 buffer.append((char) c); 1072 if (c == TAG_END) 1073 { 1074 popState(); 1075 if (mState == DONE) mState = INITIAL; 1076 buffer.clear(); 1077 break; 1078 } 1079 1080 c = nextChar(); 1081 } 1082 break; 1083 1084 case IN_DOCTYPE: 1085 while (true) 1086 { 1087 buffer.append((char) c); 1088 if (c == TAG_END) 1089 { 1090 popState(); 1091 if (mState == DONE) mState = INITIAL; 1092 mDoctype = Doctype.valueOf(buffer.toString()); 1093 buffer.clear(); 1094 break; 1095 } 1096 1097 c = nextChar(); 1098 } 1099 break; 1100 } 1101 } 1102 1103 } 1104 catch (EOFException e) 1105 { 1106 // Ignore. 1107 } 1108 1109 if (mDepth != 0) 1110 { 1111 throw new SAXException("XML Document is not properly ended! " 1112 + "Remaining depth= " + mDepth); 1113 } 1114 else if (mState != INITIAL 1115 && mState != DONE) 1116 { 1117 throw new SAXException("Parser ended in bad state (" + mState + ")!"); 1118 } 1119 else 1120 { 1121 mContentHandler.endDocument(); 1122 } 1123 1124 } 1125 1126 //########################################################################### 1127 // PRIVATE METHODS 1128 //########################################################################### 1129 1130 1131 //--------------------------------------------------------------------------- 1132 private void debugMsg(String inMsg) 1133 { 1134 if (debug) 1135 { 1136 System.err.println(inMsg); 1137 } 1138 } 1139 1140 //--------------------------------------------------------------------------- 1141 private String expandEntity(String inEntity) 1142 throws SAXException 1143 { 1144 String expandedEntity = null; 1145 1146 if (mExpandEntities) 1147 { 1148 if (inEntity.startsWith("#")) 1149 { 1150 if (inEntity.charAt(1) == 'x') 1151 { 1152 // Hex 1153 expandedEntity = "" + (char) Integer.parseInt(inEntity.substring(2), 16); 1154 } 1155 else 1156 { 1157 // Decimal 1158 expandedEntity = "" + (char) Integer.parseInt(inEntity.substring(1)); 1159 } 1160 } 1161 else 1162 { 1163 // Try to resolve it with our battery of standard entity classes. 1164 1165 expandedEntity = SpecialCharacterEntities.resolveEntity(inEntity); 1166 1167 if (null == expandedEntity) 1168 { 1169 expandedEntity = Latin1Entities.getInstance().getNumericEntity(inEntity); 1170 } 1171 1172 if (null == expandedEntity) 1173 { 1174 expandedEntity = SymbolEntities.getInstance().getNumericEntity(inEntity); 1175 } 1176 1177 1178 if (expandedEntity != null 1179 && expandedEntity.startsWith("#")) 1180 { 1181 expandedEntity = "" + (char) Integer.parseInt(expandedEntity.substring(1)); 1182 } 1183 } 1184 1185 // Have we resolved it yet? If not, use the custom entity resolver if one 1186 // has been specified. 1187 if (null == expandedEntity) 1188 { 1189 if (mEntityResolver != null) 1190 { 1191 InputStream inStream = null; 1192 1193 try 1194 { 1195 InputSource expandedEntitySource = mEntityResolver.resolveEntity(null, inEntity); 1196 if (expandedEntitySource != null) 1197 { 1198 inStream = expandedEntitySource.getByteStream(); 1199 StringBuffer entityBuffer = new StringBuffer(); 1200 1201 int c = 0; 1202 while ((c = inStream.read()) != -1) 1203 { 1204 entityBuffer.append((char) c); 1205 } 1206 1207 expandedEntity = entityBuffer.toString(); 1208 } 1209 else 1210 { 1211 throw new SAXException("'" + inEntity + "' is not a recognized entity."); 1212 } 1213 } 1214 catch (IOException e) 1215 { 1216 throw new SAXException(e); 1217 } 1218 finally 1219 { 1220 if (inStream != null) 1221 { 1222 try 1223 { 1224 inStream.close(); 1225 } 1226 catch (IOException e) 1227 { 1228 } 1229 } 1230 } 1231 } 1232 } 1233 } 1234 1235 return expandedEntity; 1236 } 1237 1238 //--------------------------------------------------------------------------- 1239 private Attributes generateAttributeList(HashMap inAttributes) 1240 { 1241 // NOTE: WE ARE NOT CURRENTLY SUPPORTING ATTRIBUTE TYPES XXXXXXXXXXXXXX 1242 // The attribute type is one of the strings "CDATA", "ID", "IDREF", "IDREFS", 1243 // "NMTOKEN", "NMTOKENS", "ENTITY", "ENTITIES", or "NOTATION" (always in upper case). 1244 // 1245 // If the parser has not read a declaration for the attribute, or if the parser 1246 // does not report attribute types, then it must return the value "CDATA" as 1247 // stated in the XML 1.0 Recommentation (clause 3.3.3, "Attribute-Value Normalization"). 1248 // 1249 // For an enumerated attribute that is not a notation, the parser will report 1250 // the type as "NMTOKEN". 1251 String attType = "CDATA"; 1252 1253 AttributesImpl attList = new AttributesImpl(); 1254 1255 if (inAttributes != null) 1256 { 1257 Iterator iter = inAttributes.entrySet().iterator(); 1258 while (iter.hasNext()) 1259 { 1260 Map.Entry att = (Map.Entry) iter.next(); 1261 attList.addAttribute("", (String) att.getKey(), (String) att.getKey(), attType, (String) att.getValue()); 1262 } 1263 } 1264 1265 return attList; 1266 } 1267 1268 //--------------------------------------------------------------------------- 1269 private void initNamespaceStructures() 1270 { 1271 mNamespaceMap = new HashMap(); 1272 mDefaultNamespaceStack = null; 1273 } 1274 1275 //--------------------------------------------------------------------------- 1276 private void pushState() 1277 { 1278 mStateStack.push(new Integer(mState)); 1279 //debugMsg(mState + " pushed onto the StateStack"); 1280 } 1281 1282 //--------------------------------------------------------------------------- 1283 private void popState() 1284 { 1285 mState = ((Integer) mStateStack.pop()).intValue(); 1286 //debugMsg(mState + " popped off the StateStack"); 1287 } 1288 1289 //--------------------------------------------------------------------------- 1290 private void pushDepth(String inTagName) 1291 { 1292 mDepth++; 1293 //debugMsg("Depth pushed to " + mDepth); 1294 1295 mTagStack.push(inTagName); 1296 } 1297 1298 //--------------------------------------------------------------------------- 1299 private void popDepth() 1300 throws SAXException 1301 { 1302 //debugMsg("Depth popped to " + (mDepth - 1)); 1303 if (mNamespaceMap.size() > 0) 1304 { 1305 // Can't use an iterator because we might delete the map entry and that would 1306 // cause a ConcurrentModificaitonException. 1307 Object[] prefixes = mNamespaceMap.keySet().toArray(); 1308 for (int i = 0; i < prefixes.length; i++) 1309 { 1310 String prefix = (String) prefixes[i]; 1311 Stack namespaceStack = (Stack) mNamespaceMap.get(prefix); 1312 // There must be at least one value on the stack or we should have deleted 1313 // the map entry for the prefix. 1314 Namespace namespace = (Namespace) namespaceStack.peek(); 1315 if (namespace.getTagLevel() >= mDepth) 1316 { 1317 namespaceStack.pop(); 1318 if (namespaceStack.size() == 0) 1319 { 1320 mNamespaceMap.remove(prefix); 1321 } 1322 } 1323 } 1324 } 1325 1326 // Adjust the default namespace stack if necessary. 1327 if (mDefaultNamespaceStack != null 1328 && mDefaultNamespaceStack.size() > 0) 1329 { 1330 Namespace namespace = (Namespace) mDefaultNamespaceStack.peek(); 1331 if (namespace.getTagLevel() >= mDepth) 1332 { 1333 mDefaultNamespaceStack.pop(); 1334 } 1335 } 1336 1337 1338 1339 mDepth--; 1340 1341/* String openTag = (String) mTagStack.pop(); 1342 if (!inTagName.equals(openTag)) 1343 { 1344 throw new SAXException("Tag mismatch! Tag '" + openTag + "' was open " 1345 + "but the closing tag was '" + inTagName + "'!"); 1346 } 1347*/ 1348 if (mDepth == 0) popState(); 1349 } 1350 1351 //--------------------------------------------------------------------------- 1352 private void extractNamespaceDeclaration(String inName, String inURI) 1353 { 1354 int index = inName.indexOf(":"); 1355 if (index > 0) 1356 { 1357 String prefix = inName.substring(index + 1); 1358 //debugMsg("Adding namespace '" + prefix + "': " + inURI); 1359 Stack namespaceStack = (Stack) mNamespaceMap.get(prefix); 1360 if (null == namespaceStack) 1361 { 1362 namespaceStack = new Stack(); 1363 mNamespaceMap.put(prefix, namespaceStack); 1364 } 1365 namespaceStack.push(new Namespace(inURI, mDepth)); 1366 1367 XMLNamespace.getNamespace(prefix, inURI); // Register the namespace 1368 } 1369 else 1370 { 1371 //debugMsg("Setting default namespace: " + inURI); 1372 if (null == mDefaultNamespaceStack) 1373 { 1374 mDefaultNamespaceStack = new Stack(); 1375 } 1376 mDefaultNamespaceStack.push(new Namespace(inURI, mDepth)); 1377 1378 XMLNamespace.getNamespace(inURI); // Register the namespace 1379 } 1380 1381 } 1382 1383 //--------------------------------------------------------------------------- 1384 private String getNamespaceURI(String inPrefix) 1385 { 1386 String uri = null; 1387 1388 if (inPrefix != null) 1389 { 1390 Stack namespaceStack = (Stack) mNamespaceMap.get(inPrefix); 1391 if (namespaceStack != null) 1392 { 1393 uri = ((Namespace) namespaceStack.peek()).getURI(); 1394 } 1395 } 1396 1397 if (null == uri 1398 && mDefaultNamespaceStack != null 1399 && mDefaultNamespaceStack.size() > 0) 1400 { 1401 uri = ((Namespace)mDefaultNamespaceStack.peek()).getURI(); 1402 } 1403 1404 if (null == uri) uri = ""; 1405 1406 return uri; 1407 } 1408 1409 //--------------------------------------------------------------------------- 1410 private String getNamespacePrefix(String inValue) 1411 { 1412 String returnValue = null; 1413 int i = inValue.indexOf(":"); 1414 if (i > 0) 1415 { 1416 returnValue = inValue.substring(0, i); 1417 } 1418 1419 return returnValue; 1420 } 1421 1422 //--------------------------------------------------------------------------- 1423 private String getLocalName(String inValue) 1424 { 1425 String returnValue = inValue; 1426 int i = inValue.indexOf(":"); 1427 if (i > 0) 1428 { 1429 returnValue = inValue.substring(i + 1); 1430 } 1431 1432 return returnValue; 1433 } 1434 1435 //########################################################################### 1436 // INNER CLASSES 1437 //########################################################################### 1438 1439 1440 private class EOFException extends RuntimeException 1441 { 1442 } 1443 1444 private class Namespace 1445 { 1446 private String mURI; 1447 private int mTagLevel; 1448 1449 //------------------------------------------------------------------------ 1450 public Namespace(String inURI, int inTagLevel) 1451 { 1452 mURI = inURI; 1453 mTagLevel = inTagLevel; 1454 } 1455 1456 //------------------------------------------------------------------------ 1457 public String getURI() 1458 { 1459 return mURI; 1460 } 1461 1462 //------------------------------------------------------------------------ 1463 public int getTagLevel() 1464 { 1465 return mTagLevel; 1466 } 1467 } 1468 1469}