Examples of org.apache.tika.sax.EndDocumentShieldingContentHandler

org.apache.tika.sax.EndDocumentShieldingContentHandler
A wrapper around a {@link ContentHandler} which will ignore normal SAX calls to {@link #endDocument()}, and only fire them later. This is typically used to ensure that we can output the metadata before ending the document.

            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
       
        // As we don't know which of the metadata or the content
        //  we'll hit first, catch the endDocument call initially
        EndDocumentShieldingContentHandler handler = 
          new EndDocumentShieldingContentHandler(baseHandler);
       
        // Process the file in turn
        ZipInputStream zip = new ZipInputStream(stream);
        ZipEntry entry = zip.getNextEntry();
        while (entry != null) {
            if (entry.getName().equals("mimetype")) {
                String type = IOUtils.toString(zip, "UTF-8");
                metadata.set(Metadata.CONTENT_TYPE, type);
            } else if (entry.getName().equals("meta.xml")) {
                meta.parse(zip, new DefaultHandler(), metadata, context);
            } else if (entry.getName().endsWith("content.xml")) {
                content.parse(zip, handler, metadata, context);
            }
            entry = zip.getNextEntry();
        }
        
        // Only now call the end document
        if(handler.getEndDocumentWasCalled()) {
           handler.reallyEndDocument();
        }
    }

View Full Code Here

                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
            }
            
            // We need to get the content first, but not end 
            //  the document just yet
            EndDocumentShieldingContentHandler handler = 
               new EndDocumentShieldingContentHandler(baseHandler);
            extractor.getXHTML(handler, metadata, context);


            // Now we can get the metadata
            extractor.getMetadataExtractor().extract(metadata);
            
            // Then finish up
            handler.reallyEndDocument();
        } catch (IllegalArgumentException e) {
            if (e.getMessage().startsWith("No supported documents found")) {
                throw new TikaException(
                        "TIKA-418: RuntimeException while getting content"
                        + " for thmx and xps file types", e);

View Full Code Here

        // Prepare to handle the content
        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);


        // As we don't know which of the metadata or the content
        //  we'll hit first, catch the endDocument call initially
        EndDocumentShieldingContentHandler handler = 
          new EndDocumentShieldingContentHandler(xhtml);
        
        // If we can, process the metadata first, then the
        //  rest of the file afterwards
        // Only possible to guarantee that when opened from a file not a stream
        ZipEntry entry = null;
        if (zipFile != null) {
            entry = zipFile.getEntry(META_NAME);
            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);


            Enumeration<? extends ZipEntry> entries = zipFile.entries();
            while (entries.hasMoreElements()) {
                entry = entries.nextElement();
                if (! META_NAME.equals(entry.getName())) {
                    handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
                }
            }
            zipFile.close();
        } else {
            do {
                entry = zipStream.getNextEntry();
                handleZipEntry(entry, zipStream, metadata, context, handler);
            } while (entry != null);
            zipStream.close();
        }
        
        // Only now call the end document
        if(handler.getEndDocumentWasCalled()) {
           handler.reallyEndDocument();
        }
    }

View Full Code Here

                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
            }
            
            // We need to get the content first, but not end 
            //  the document just yet
            EndDocumentShieldingContentHandler handler = 
               new EndDocumentShieldingContentHandler(baseHandler);
            extractor.getXHTML(handler, metadata, context);


            // Now we can get the metadata
            extractor.getMetadataExtractor().extract(metadata);
            
            // Then finish up
            handler.reallyEndDocument();
        } catch (IllegalArgumentException e) {
            if (e.getMessage().startsWith("No supported documents found")) {
                throw new TikaException(
                        "TIKA-418: RuntimeException while getting content"
                        + " for thmx and xps file types", e);

View Full Code Here


        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);


        // As we don't know which of the metadata or the content
        //  we'll hit first, catch the endDocument call initially
        EndDocumentShieldingContentHandler handler = 
          new EndDocumentShieldingContentHandler(xhtml);


        // Process the file in turn
        ZipInputStream zip = new ZipInputStream(stream);
        ZipEntry entry = zip.getNextEntry();
        while (entry != null) {
            if (entry.getName().equals("mimetype")) {
                String type = IOUtils.toString(zip, "UTF-8");
                metadata.set(Metadata.CONTENT_TYPE, type);
            } else if (entry.getName().equals("meta.xml")) {
                meta.parse(zip, new DefaultHandler(), metadata, context);
            } else if (entry.getName().endsWith("content.xml")) {
                if (content instanceof OpenDocumentContentParser) {
                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
                } else {
                    // Foreign content parser was set:
                    content.parse(zip, handler, metadata, context);
                }
            } else if (entry.getName().endsWith("styles.xml")) {
                if (content instanceof OpenDocumentContentParser) {
                    ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
                } else {
                    // Foreign content parser was set:
                    content.parse(zip, handler, metadata, context);
                }
            }
            entry = zip.getNextEntry();
        }
        
        // Only now call the end document
        if(handler.getEndDocumentWasCalled()) {
           handler.reallyEndDocument();
        }
    }

View Full Code Here

                extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
            }
            
            // We need to get the content first, but not end 
            //  the document just yet
            EndDocumentShieldingContentHandler handler = 
               new EndDocumentShieldingContentHandler(baseHandler);
            extractor.getXHTML(handler, metadata, context);


            // Now we can get the metadata
            extractor.getMetadataExtractor().extract(metadata);
            
            // Then finish up
            handler.reallyEndDocument();
        } catch (IllegalArgumentException e) {
            if (e.getMessage().startsWith("No supported documents found")) {
                throw new TikaException(
                        "TIKA-418: RuntimeException while getting content"
                        + " for thmx and xps file types", e);

View Full Code Here

        // Prepare to handle the content
        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);


        // As we don't know which of the metadata or the content
        //  we'll hit first, catch the endDocument call initially
        EndDocumentShieldingContentHandler handler = 
          new EndDocumentShieldingContentHandler(xhtml);
        
        // If we can, process the metadata first, then the
        //  rest of the file afterwards
        // Only possible to guarantee that when opened from a file not a stream
        ZipEntry entry = null;
        if (zipFile != null) {
            entry = zipFile.getEntry(META_NAME);
            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);


            Enumeration<? extends ZipEntry> entries = zipFile.entries();
            while (entries.hasMoreElements()) {
                entry = entries.nextElement();
                if (! META_NAME.equals(entry.getName())) {
                    handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
                }
            }
            zipFile.close();
        } else {
            do {
                entry = zipStream.getNextEntry();
                handleZipEntry(entry, zipStream, metadata, context, handler);
            } while (entry != null);
            zipStream.close();
        }
        
        // Only now call the end document
        if(handler.getEndDocumentWasCalled()) {
           handler.reallyEndDocument();
        }
    }

View Full Code Here

TOP

Related Classes of org.apache.tika.sax.EndDocumentShieldingContentHandler

org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory

org.apache.tika.parser.odf.OpenDocumentParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.