Class BoilerpipeHTMLContentHandler

  • All Implemented Interfaces:
    org.xml.sax.ContentHandler

    public class BoilerpipeHTMLContentHandler
    extends java.lang.Object
    implements org.xml.sax.ContentHandler
    A simple SAX ContentHandler, used by BoilerpipeSAXInput. It can be used with different parser implementations, e.g. NekoHTML and TagSoup.
    • Field Detail

      • tagActions

        private final java.util.Map<java.lang.String, TagAction> tagActions
      • title

        private java.lang.String title
      • tokenBuffer

        java.lang.StringBuilder tokenBuffer
      • textBuffer

        java.lang.StringBuilder textBuffer
      • inBody

        int inBody
      • inAnchor

        int inAnchor
      • inIgnorableElement

        int inIgnorableElement
      • tagLevel

        int tagLevel
      • blockTagLevel

        int blockTagLevel
      • sbLastWasWhitespace

        boolean sbLastWasWhitespace
      • textElementIdx

        private int textElementIdx
      • textBlocks

        private final java.util.List<TextBlock> textBlocks
      • lastStartTag

        private java.lang.String lastStartTag
      • lastEndTag

        private java.lang.String lastEndTag
      • offsetBlocks

        private int offsetBlocks
      • currentContainedTextElements

        private java.util.BitSet currentContainedTextElements
      • flush

        private boolean flush
      • inAnchorText

        boolean inAnchorText
      • labelStacks

        java.util.LinkedList<java.util.LinkedList<LabelAction>> labelStacks
      • fontSizeStack

        java.util.LinkedList<java.lang.Integer> fontSizeStack
      • PAT_VALID_WORD_CHARACTER

        private static final java.util.regex.Pattern PAT_VALID_WORD_CHARACTER
    • Method Detail

      • recycle

        public void recycle()
        Recycles this instance.
      • endDocument

        public void endDocument()
                         throws org.xml.sax.SAXException
        Specified by:
        endDocument in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • endPrefixMapping

        public void endPrefixMapping(java.lang.String prefix)
                              throws org.xml.sax.SAXException
        Specified by:
        endPrefixMapping in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • ignorableWhitespace

        public void ignorableWhitespace(char[] ch,
                                        int start,
                                        int length)
                                 throws org.xml.sax.SAXException
        Specified by:
        ignorableWhitespace in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • processingInstruction

        public void processingInstruction(java.lang.String target,
                                          java.lang.String data)
                                   throws org.xml.sax.SAXException
        Specified by:
        processingInstruction in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • setDocumentLocator

        public void setDocumentLocator(org.xml.sax.Locator locator)
        Specified by:
        setDocumentLocator in interface org.xml.sax.ContentHandler
      • skippedEntity

        public void skippedEntity(java.lang.String name)
                           throws org.xml.sax.SAXException
        Specified by:
        skippedEntity in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • startDocument

        public void startDocument()
                           throws org.xml.sax.SAXException
        Specified by:
        startDocument in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • startPrefixMapping

        public void startPrefixMapping(java.lang.String prefix,
                                       java.lang.String uri)
                                throws org.xml.sax.SAXException
        Specified by:
        startPrefixMapping in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • startElement

        public void startElement(java.lang.String uri,
                                 java.lang.String localName,
                                 java.lang.String qName,
                                 org.xml.sax.Attributes atts)
                          throws org.xml.sax.SAXException
        Specified by:
        startElement in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • endElement

        public void endElement(java.lang.String uri,
                               java.lang.String localName,
                               java.lang.String qName)
                        throws org.xml.sax.SAXException
        Specified by:
        endElement in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • characters

        public void characters(char[] ch,
                               int start,
                               int length)
                        throws org.xml.sax.SAXException
        Specified by:
        characters in interface org.xml.sax.ContentHandler
        Throws:
        org.xml.sax.SAXException
      • getTextBlocks

        java.util.List<TextBlock> getTextBlocks()
      • flushBlock

        public void flushBlock()
      • addTextBlock

        protected void addTextBlock(TextBlock tb)
      • isWord

        private static boolean isWord(java.lang.String token)
      • getTitle

        public java.lang.String getTitle()
      • setTitle

        public void setTitle(java.lang.String s)
      • addWhitespaceIfNecessary

        public void addWhitespaceIfNecessary()
      • addLabelAction

        public void addLabelAction(LabelAction la)
                            throws java.lang.IllegalStateException
        Throws:
        java.lang.IllegalStateException