Examples of ParseContext


Examples of org.apache.tika.parser.ParseContext

        _contentExtractor = contentExtractor;
        _linkExtractor = linkExtractor;

        if (includeMarkup) {
            _parseContext = new ParseContext();
            _parseContext.set(HtmlMapper.class, FixedIdentityHtmlMapper.INSTANCE);
        }
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

     * aren't part of the default set.
     *
     * @return
     */
    private ParseContext makeParseContext() {
        ParseContext result = new ParseContext();

        Set<String> validTags = _linkExtractor.getLinkTags();
        HtmlMapper defaultMapper = DefaultHtmlMapper.INSTANCE;
        for (String tag : validTags) {
            if (defaultMapper.mapSafeElement(tag) == null) {
                result.set(HtmlMapper.class, new CustomHtmlMapper(validTags, _linkExtractor.getLinkAttributeTypes()));
                break;
            }
        }
       
        return result;
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
 
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
 
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    private String encoding = null;

    private boolean pipeMode = true;

    public TikaCLI() throws TransformerConfigurationException {
        context = new ParseContext();
        parser = new AutoDetectParser();
        context.set(Parser.class, parser);
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

        xhtml.endDocument();
    }

    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
        assertEquals("en", metadata.get(Metadata.LANGUAGE));
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes("UTF-8")),
                handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

        assertTrue(handler.toString().contains(text));
    }
View Full Code Here

Examples of org.apache.tika.parser.ParseContext

    public void testEmptyText() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
        assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("\n", handler.toString());
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.