Examples of org.apache.tika.extractor.ContainerExtractor

org.apache.tika.extractor.ContainerExtractor
Tika container extractor interface. Container extractors provide access to the embedded resources within container formats such as .zip and .doc.

     *       -> powerpoint
     *       -> excel
     *           -> image
     */
    public void testEmbeddedOfficeFiles() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
       
       
       // Excel with a word doc and a powerpoint doc, both of which have images in them
       // Without recursion, should see both documents + the images

View Full Code Here

    @Test
    public void testBinControlWord() throws Exception {
        ByteCopyingHandler embHandler = new ByteCopyingHandler();
        TikaInputStream tis = null;
        try {
            ContainerExtractor ex = new ParserContainerExtractor();
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"));
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, embHandler);            
        } finally {
            tis.close();
        }
        assertEquals(1, embHandler.bytes.size());

View Full Code Here

        trueTypes.add("image/jpeg");
        
        TrackingHandler tracker = new TrackingHandler(skipTypes);
        TikaInputStream tis = null;
        try {
            ContainerExtractor ex = new ParserContainerExtractor();
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"));
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);


        } finally {
            tis.close();
        }


        assertEquals(trueNames.size(), tracker.filenames.size());
        assertEquals(trueTypes.size(), tracker.mediaTypes.size());
        for (int i = 0; i < tracker.filenames.size(); i++) {
            String expectedName = trueNames.get(i);
            if (expectedName == null) {
                assertNull(tracker.filenames.get(i));
            } else {
                assertNotNull(tracker.filenames.get(i));
                //necessary to getName() because MSOffice extractor includes
                //directory: _1457338524/HW.txt
                assertEquals("filename equals ", 
                        expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
            }
            assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
        }
        
        tracker = new TrackingHandler();
        tis = null;
        try {
            ContainerExtractor ex = new ParserContainerExtractor();
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"));
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);


        } finally {
            tis.close();
        }
        assertEquals(47, tracker.filenames.size());

View Full Code Here

        skipTypes.add(MediaType.parse("application/x-msmetafile"));


        TrackingHandler tracker = new TrackingHandler(skipTypes);
        TikaInputStream tis = null;
        try {
            ContainerExtractor ex = new ParserContainerExtractor();
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"));
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);


        } finally {
            tis.close();
        }
        //should gracefully skip link and not throw NPE, IOEx, etc
        assertEquals(0, tracker.filenames.size());


        tracker = new TrackingHandler();
        tis = null;
        try {
            ContainerExtractor ex = new ParserContainerExtractor();
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"));
            assertEquals(true, ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        } finally {
            tis.close();
        }
        //should gracefully skip link and not throw NPE, IOEx, etc
        assertEquals(2, tracker.filenames.size());

View Full Code Here

    /**
     * For office files which don't have anything embedded in them
     */
    @Test
    public void testWithoutEmbedded() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       
       String[] files = new String[] {
             "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
             "testVISIO.vsd", "test-outlook.msg"
       };

View Full Code Here

     * Office files with embedded images, but no other
     *  office files in them
     */
    @Test
    public void testEmbeddedImages() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
       
       // Excel with 1 image
       handler = process("testEXCEL_1img.xls", extractor, false);
       assertEquals(1, handler.filenames.size());

View Full Code Here

     *       -> excel
     *           -> image
     */
    @Test
    public void testEmbeddedOfficeFiles() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
       
       
       // Excel with a word doc and a powerpoint doc, both of which have images in them
       // Without recursion, should see both documents + the images

View Full Code Here

       assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
    }


    @Test
    public void testEmbeddedOfficeFilesXML() throws Exception {
        ContainerExtractor extractor = new ParserContainerExtractor();
        TrackingHandler handler;


        handler = process("EmbeddedDocument.docx", extractor, false);
        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
        assertEquals(2, handler.filenames.size());

View Full Code Here

        assertEquals(2, handler.filenames.size());
    }


    @Test
    public void testPowerpointImages() throws Exception {
        ContainerExtractor extractor = new ParserContainerExtractor();
        TrackingHandler handler;


        handler = process("pictures.ppt", extractor, false);
        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));

View Full Code Here


    @Test
    public void testEmbedded() throws Exception {
        InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
        try {
            ContainerExtractor extractor = new ParserContainerExtractor();
            TikaInputStream stream = TikaInputStream.get(input);


            assertEquals(true, extractor.isSupported(stream));


            // Process it
            TrackingHandler handler = new TrackingHandler();
            extractor.extract(stream, null, handler);


            assertEquals(2, handler.filenames.size());
        } finally {
            input.close();
        }

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.apache.tika.extractor.ContainerExtractor

org.apache.tika.parser.microsoft.POIContainerExtractionTest

org.apache.tika.parser.microsoft.TNEFParserTest

org.apache.tika.parser.pdf.PDFParserTest

org.apache.tika.parser.rtf.RTFParserTest

org.apache.tika.parser.xml.FictionBookParserTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.