Examples of org.apache.tika.sax.BasicContentHandlerFactory

org.apache.tika.sax.BasicContentHandlerFactory
Basic factory for creating common types of ContentHandlers

    //TIKA-1010 test regular (not "embedded") images/picts
    public void testRegularImages() throws Exception {
        Parser base = new AutoDetectParser();
        ParseContext ctx = new ParseContext();
        RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ctx.set(org.apache.tika.parser.Parser.class, parser);
        TikaInputStream tis = null;
        ContentHandler handler = new BodyContentHandler();
        Metadata rootMetadata = new Metadata();
        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");

View Full Code Here

        } else if (type.equals(TEXT_MAIN)) {
            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
        } else if (type.equals(METADATA)) {
            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
        }
        return new BasicContentHandlerFactory(handlerType, -1);
    }

View Full Code Here

                    "Try the app with command line argument of -J."
            );
        }
        if (isReset) {
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
            wrapper.parse(input, null, new Metadata(), new ParseContext());
            StringWriter jsonBuffer = new StringWriter();
            JsonMetadataList.setPrettyPrinting(true);
            JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);

View Full Code Here

        String xml = getXML("/testPDF_childAttachments.pdf").xml;
        //"regressiveness" exists only in Unit10.doc not in the container pdf document
        assertTrue(xml.contains("regressiveness"));


        RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        TikaInputStream tis = null;
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

View Full Code Here

        config.setExtractUniqueInlineImagesOnly(false);


        Parser defaultParser = new AutoDetectParser();


        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
        context.set(org.apache.tika.parser.Parser.class, p);
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1);

View Full Code Here

    @Test
    public void testInlineConfig() throws Exception {
        
        Parser defaultParser = new AutoDetectParser();
        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.Parser.class, p);
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1);
        String path = "/test-documents/testPDF_childAttachments.pdf";

View Full Code Here

    public void testEmbeddedFileNameExtraction() throws Exception {
        InputStream is = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF_multiFormatEmbFiles.pdf");
        RecursiveParserWrapper p = new RecursiveParserWrapper(
                new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        p.parse(is, h, m, c);

View Full Code Here

    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
        InputStream is = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF_multiFormatEmbFiles.pdf");
        RecursiveParserWrapper p = new RecursiveParserWrapper(
                new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        p.parse(is, h, m, c);

View Full Code Here

public class RecursiveParserWrapperTest {


    @Test
    public void testBasicXML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file
        assertTrue(content.indexOf("<p class=\"header\" />") > -1);
    }

View Full Code Here

    }


    @Test
    public void testBasicHTML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file
        assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
    }

View Full Code Here

0 1

TOP

Related Classes of org.apache.tika.sax.BasicContentHandlerFactory

org.apache.tika.cli.TikaCLI

org.apache.tika.gui.TikaGUI

org.apache.tika.parser.pdf.PDFParserTest

org.apache.tika.parser.RecursiveParserWrapperTest

org.apache.tika.parser.rtf.RTFParserTest

org.xml.sax.helpers.DefaultHandler

java.io.OutputStreamWriter

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.