Examples of de.l3s.boilerpipe.document.TextDocument

de.l3s.boilerpipe.document.TextDocument

    /**
     * getContent returns the boilerpipe extracted text.
     */
    @Override
    public String getContent() {
        TextDocument textDocument = _bpContentHandler.getTextDocument();
        return textDocument.getText(true, false);
    }

View Full Code Here


    @Override
    public void endDocument() throws SAXException {
        super.endDocument();


        TextDocument td = toTextDocument();
        try {
            extractor.process(td);
        } catch (BoilerpipeProcessingException e) {
            throw new SAXException(e);
        }
        
        Attributes emptyAttrs = new AttributesImpl();


        delegate.startDocument();
        delegate.startPrefixMapping("", XHTMLContentHandler.XHTML);


        delegate.startElement(XHTMLContentHandler.XHTML, "html", "html", emptyAttrs);
        delegate.startElement(XHTMLContentHandler.XHTML, "head", "head", emptyAttrs);
        delegate.startElement(XHTMLContentHandler.XHTML, "title", "title", emptyAttrs);
        
        if (td.getTitle() != null) {
            char[] titleChars = td.getTitle().toCharArray();
            delegate.characters(titleChars, 0, titleChars.length);
            delegate.ignorableWhitespace(NL, 0, NL.length);
        }
        
        delegate.endElement(XHTMLContentHandler.XHTML, "title", "title");
        delegate.endElement(XHTMLContentHandler.XHTML, "head", "head");
        
        delegate.startElement(XHTMLContentHandler.XHTML, "body", "body", emptyAttrs);


        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                char[] chars = block.getText().toCharArray();
                delegate.characters(chars, 0, chars.length);
                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");

View Full Code Here

                if(cache.contains(url.toString())){
                    text = cache.get(url.toString());
                    logger.debug("  Fetched from cache:"+url.toString());
                } else {
                    HTMLDocument htmlDoc = HTMLFetcher.fetch(url);
                    TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
                    text = ArticleExtractor.INSTANCE.getText(doc);
                    cache.put(url.toString(), text);
                    logger.debug("Fetched from web:"+url.toString());
                }
                if(text.length()<100){

View Full Code Here

    
    @Override
    public void endDocument() throws SAXException {
        super.endDocument();


        TextDocument td = toTextDocument();
        try {
            extractor.process(td);
        } catch (BoilerpipeProcessingException e) {
            throw new SAXException(e);
        }
        
        Attributes emptyAttrs = new AttributesImpl();


        // At this point we have all the information we need to either emit N paragraphs
        // of plain text (if not including markup), or we have to replay our recorded elements
        // and only emit character runs that passed the boilerpipe filters.
        if (includeMarkup) {
            BitSet validCharacterRuns = new BitSet();
            for (TextBlock block : td.getTextBlocks()) {
                if (block.isContent()) {
                    BitSet bs = block.getContainedTextElements();
                    if (bs != null) {
                        validCharacterRuns.or(bs);
                    }
                }
            }
            
            // Now have bits set for all valid character runs. Replay our recorded elements,
            // but only emit character runs flagged as valid.
            int curCharsIndex = headerCharOffset;
            for (RecordedElement element : elements) {
                switch (element.getElementType()) {
                    case START:
                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
                        // Fall through
                        
                    case CONTINUE:
                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
                        // we have to follow suit.
                        for (char[] chars : element.getCharacters()) {
                            curCharsIndex++;
                            
                            if (validCharacterRuns.get(curCharsIndex)) {
                                delegate.characters(chars, 0, chars.length);
                            }
                        }
                        break;
                        
                    case END:
                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
                        break;
                        
                    default:
                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
                }
                
                
            }
        } else {
            for (TextBlock block : td.getTextBlocks()) {
                if (block.isContent()) {
                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                    char[] chars = block.getText().toCharArray();
                    delegate.characters(chars, 0, chars.length);
                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");

View Full Code Here

TOP

Related Classes of de.l3s.boilerpipe.document.TextDocument

bixo.parser.BoilerpipeContentExtractor

org.apache.tika.parser.html.BoilerpipeContentHandler

org.mediameter.cliff.test.places.focus.GdeltFocusChecker

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.