Package jodd.lagarto

Class LagartoParser


  • public class LagartoParser
    extends java.lang.Object
    HTML/XML content parser/tokenizer using TagVisitor for callbacks. Works according to the HTML5 specs for tokenization, as described by WHATWG. Differences from the specs:
    • text is emitted as a block of text, and not character by character.
    • tag name case (and the letter case of other entities) is not changed, but case-sensitive information exists for matching.
    • the whole tokenization process is implemented here, without going into tree building. This applies to switching to the RAWTEXT state.
    • script tag is emitted separately
    • conditional comments added
    • xml states and callbacks added
    • Field Detail

      • parsing

        protected boolean parsing
      • DATA_STATE

        protected State DATA_STATE
        Data state.
      • TAG_OPEN

        protected State TAG_OPEN
      • END_TAG_OPEN

        protected State END_TAG_OPEN
      • TAG_NAME

        protected State TAG_NAME
      • BEFORE_ATTRIBUTE_NAME

        protected State BEFORE_ATTRIBUTE_NAME
      • ATTRIBUTE_NAME

        protected State ATTRIBUTE_NAME
      • AFTER_ATTRIBUTE_NAME

        protected State AFTER_ATTRIBUTE_NAME
      • BEFORE_ATTRIBUTE_VALUE

        protected State BEFORE_ATTRIBUTE_VALUE
      • ATTR_VALUE_UNQUOTED

        protected State ATTR_VALUE_UNQUOTED
      • ATTR_VALUE_SINGLE_QUOTED

        protected State ATTR_VALUE_SINGLE_QUOTED
      • ATTR_VALUE_DOUBLE_QUOTED

        protected State ATTR_VALUE_DOUBLE_QUOTED
      • AFTER_ATTRIBUTE_VALUE_QUOTED

        protected State AFTER_ATTRIBUTE_VALUE_QUOTED
      • SELF_CLOSING_START_TAG

        protected State SELF_CLOSING_START_TAG
      • BOGUS_COMMENT

        protected State BOGUS_COMMENT
      • MARKUP_DECLARATION_OPEN

        protected State MARKUP_DECLARATION_OPEN
      • rawTextStart

        protected int rawTextStart
      • rawTextEnd

        protected int rawTextEnd
      • rawTagName

        protected char[] rawTagName
      • RAWTEXT

        protected State RAWTEXT
      • RAWTEXT_LESS_THAN_SIGN

        protected State RAWTEXT_LESS_THAN_SIGN
      • RAWTEXT_END_TAG_OPEN

        protected State RAWTEXT_END_TAG_OPEN
      • RAWTEXT_END_TAG_NAME

        protected State RAWTEXT_END_TAG_NAME
      • rcdataTagStart

        protected int rcdataTagStart
      • rcdataTagName

        protected char[] rcdataTagName
      • RCDATA

        protected State RCDATA
      • RCDATA_LESS_THAN_SIGN

        protected State RCDATA_LESS_THAN_SIGN
      • RCDATA_END_TAG_OPEN

        protected State RCDATA_END_TAG_OPEN
      • RCDATA_END_TAG_NAME

        protected State RCDATA_END_TAG_NAME
      • commentStart

        protected int commentStart
      • COMMENT_START

        protected State COMMENT_START
      • COMMENT_START_DASH

        protected State COMMENT_START_DASH
      • COMMENT

        protected State COMMENT
      • COMMENT_END_DASH

        protected State COMMENT_END_DASH
      • COMMENT_END

        protected State COMMENT_END
      • COMMENT_END_BANG

        protected State COMMENT_END_BANG
      • DOCTYPE

        protected State DOCTYPE
      • BEFORE_DOCTYPE_NAME

        protected State BEFORE_DOCTYPE_NAME
      • DOCTYPE_NAME

        protected State DOCTYPE_NAME
      • AFTER_DOCUMENT_NAME

        protected State AFTER_DOCUMENT_NAME
      • doctypeIdNameStart

        protected int doctypeIdNameStart
      • AFTER_DOCTYPE_PUBLIC_KEYWORD

        protected State AFTER_DOCTYPE_PUBLIC_KEYWORD
      • BEFORE_DOCTYPE_PUBLIC_IDENTIFIER

        protected State BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
      • DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED

        protected State DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
      • DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED

        protected State DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
      • AFTER_DOCTYPE_PUBLIC_IDENTIFIER

        protected State AFTER_DOCTYPE_PUBLIC_IDENTIFIER
      • BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS

        protected State BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
      • BOGUS_DOCTYPE

        protected State BOGUS_DOCTYPE
      • AFTER_DOCTYPE_SYSTEM_KEYWORD

        protected State AFTER_DOCTYPE_SYSTEM_KEYWORD
      • BEFORE_DOCTYPE_SYSTEM_IDENTIFIER

        protected State BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
      • DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED

        protected State DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
      • DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED

        protected State DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
      • AFTER_DOCTYPE_SYSTEM_IDENTIFIER

        protected State AFTER_DOCTYPE_SYSTEM_IDENTIFIER
      • scriptStartNdx

        protected int scriptStartNdx
      • scriptEndNdx

        protected int scriptEndNdx
      • scriptEndTagName

        protected int scriptEndTagName
      • SCRIPT_DATA

        protected State SCRIPT_DATA
      • SCRIPT_DATA_LESS_THAN_SIGN

        protected State SCRIPT_DATA_LESS_THAN_SIGN
      • SCRIPT_DATA_END_TAG_OPEN

        protected State SCRIPT_DATA_END_TAG_OPEN
      • SCRIPT_DATA_END_TAG_NAME

        protected State SCRIPT_DATA_END_TAG_NAME
      • text

        protected char[] text
      • textLen

        protected int textLen
      • attrStartNdx

        protected int attrStartNdx
      • attrEndNdx

        protected int attrEndNdx
      • conditionalCommentStarted

        private boolean conditionalCommentStarted
      • state

        protected State state
      • TAG_WHITESPACES

        private static final char[] TAG_WHITESPACES
      • TAG_WHITESPACES_OR_END

        private static final char[] TAG_WHITESPACES_OR_END
      • CONTINUE_CHARS

        private static final char[] CONTINUE_CHARS
      • ATTR_INVALID_1

        private static final char[] ATTR_INVALID_1
      • ATTR_INVALID_2

        private static final char[] ATTR_INVALID_2
      • ATTR_INVALID_3

        private static final char[] ATTR_INVALID_3
      • ATTR_INVALID_4

        private static final char[] ATTR_INVALID_4
      • COMMENT_DASH

        private static final char[] COMMENT_DASH
      • T_DOCTYPE

        private static final char[] T_DOCTYPE
      • T_SCRIPT

        private static final char[] T_SCRIPT
      • T_XMP

        private static final char[] T_XMP
      • T_STYLE

        private static final char[] T_STYLE
      • T_IFRAME

        private static final char[] T_IFRAME
      • T_NOFRAMES

        private static final char[] T_NOFRAMES
      • T_NOEMBED

        private static final char[] T_NOEMBED
      • T_NOSCRIPT

        private static final char[] T_NOSCRIPT
      • T_TEXTAREA

        private static final char[] T_TEXTAREA
      • T_TITLE

        private static final char[] T_TITLE
      • A_PUBLIC

        private static final char[] A_PUBLIC
      • A_SYSTEM

        private static final char[] A_SYSTEM
      • CDATA

        private static final char[] CDATA
      • CDATA_END

        private static final char[] CDATA_END
      • XML

        private static final char[] XML
      • XML_VERSION

        private static final char[] XML_VERSION
      • XML_ENCODING

        private static final char[] XML_ENCODING
      • XML_STANDALONE

        private static final char[] XML_STANDALONE
      • CC_IF

        private static final char[] CC_IF
      • CC_ENDIF

        private static final char[] CC_ENDIF
      • CC_ENDIF2

        private static final char[] CC_ENDIF2
      • CC_END

        private static final char[] CC_END
      • RAWTEXT_TAGS

        private static final char[][] RAWTEXT_TAGS
      • RCDATA_TAGS

        private static final char[][] RCDATA_TAGS
      • INVALID_CHARS

        private static final char[] INVALID_CHARS
      • _ENDIF

        private static final java.lang.CharSequence _ENDIF
    • Constructor Detail

      • LagartoParser

        public LagartoParser​(LagartoParserConfig parserConfig,
                             char[] input)
        Creates parser on char array.
      • LagartoParser

        public LagartoParser​(char[] input)
        Creates parser on char array.
      • LagartoParser

        public LagartoParser​(LagartoParserConfig parserConfig,
                             java.lang.CharSequence input)
        Creates parser on a char sequence.
      • LagartoParser

        public LagartoParser​(java.lang.CharSequence input)
        Creates parser on a char sequence.
    • Method Detail

      • initialize

        protected void initialize()
        Initializes parser.
      • parse

        public void parse​(TagVisitor visitor)
        Parses content and emits events to the provided TagVisitor.
      • consumeCharacterReference

        protected void consumeCharacterReference​(char allowedChar)
      • consumeCharacterReference

        protected void consumeCharacterReference()
      • _consumeCharacterReference

        private void _consumeCharacterReference()
      • _consumeAttrCharacterReference

        private void _consumeAttrCharacterReference()
      • _consumeNumber

        private void _consumeNumber​(int unconsumeNdx)
      • ensureCapacity

        private void ensureCapacity()
      • ensureCapacity

        private void ensureCapacity​(int growth)
      • textEmitChar

        protected void textEmitChar​(char c)
        Emits characters into the local text buffer.
      • textStart

        protected void textStart()
        Resets text buffer.
      • textEmitChars

        protected void textEmitChars​(int from,
                                     int to)
      • textEmitChars

        protected void textEmitChars​(char[] buffer)
      • textWrap

        protected java.lang.CharSequence textWrap()
      • _addAttribute

        private void _addAttribute()
      • _addAttributeWithValue

        private void _addAttributeWithValue()
      • _addAttribute

        private void _addAttribute​(java.lang.CharSequence attrName,
                                   java.lang.CharSequence attrValue)
      • emitTag

        protected void emitTag()
      • emitComment

        protected void emitComment​(int from,
                                   int to)
        Emits a comment. Also checks for conditional comments!
      • emitText

        protected void emitText()
        Emits text if there is some content.
      • emitScript

        protected void emitScript​(int from,
                                  int to)
      • emitDoctype

        protected void emitDoctype()
      • emitXml

        protected void emitXml()
      • emitCData

        protected void emitCData​(java.lang.CharSequence charSequence)
      • errorEOF

        protected void errorEOF()
      • errorInvalidToken

        protected void errorInvalidToken()
      • errorCharReference

        protected void errorCharReference()
      • _error

        protected void _error​(java.lang.String message)
        Prepares error message and reports it to the visitor.
      • isAppropriateTagName

        private boolean isAppropriateTagName​(char[] lowerCaseNameToMatch,
                                             int from,
                                             int to)
      • matchTagName

        private boolean matchTagName​(char[] tagNameLowercase)
      • switchTypeToSelfClosing

        private void switchTypeToSelfClosing()