Examples of org.apache.tika.sax.XHTMLContentHandler

org.apache.tika.sax.XHTMLContentHandler
Content handler decorator that simplifies the task of producing XHTML events for Tika content parsers.

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        // MidiSystem expects the stream to support the mark feature
        InputStream buffered = new BufferedInputStream(stream);
        try {
            Sequence sequence = MidiSystem.getSequence(buffered);


            Track[] tracks = sequence.getTracks();
            metadata.set("tracks", String.valueOf(tracks.length));
            // TODO: Use XMPDM.TRACKS?


            Patch[] patches = sequence.getPatchList();
            metadata.set("patches", String.valueOf(patches.length));


            float type = sequence.getDivisionType();
            if (type == Sequence.PPQ) {
                metadata.set("divisionType", "PPQ");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", "SMPTE_24");
            } else if (type == Sequence.SMPTE_25) {
                metadata.set("divisionType", "SMPTE_25");
            } else if (type == Sequence.SMPTE_30) {
                metadata.set("divisionType", "SMPTE_30");
            } else if (type == Sequence.SMPTE_30DROP) {
                metadata.set("divisionType", "SMPTE_30DROP");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", String.valueOf(type));
            }


            for (Track track : tracks) {
                xhtml.startElement("p");
                for (int i = 0; i < track.size(); i++) {
                    MidiMessage message = track.get(i).getMessage();
                    if (message instanceof MetaMessage) {
                        MetaMessage meta = (MetaMessage) message;
                        // Types 1-15 are reserved for text events
                        if (meta.getType() >= 1 && meta.getType() <= 15) {
                            // FIXME: What's the encoding?
                            xhtml.characters(
                                    new String(meta.getData(), "ISO-8859-1"));
                        }
                    }
                }
                xhtml.endElement("p");
            }
        } catch (InvalidMidiDataException ignore) {
            // There is no way to know whether this exception was
            // caused by the document being corrupted or by the format
            // just being unsupported. So we do nothing.
        }


        xhtml.endDocument();
    }

View Full Code Here

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        POIFSFileSystem filesystem;
        if(stream instanceof TikaInputStream && 
          ((TikaInputStream)stream).getOpenContainer() != null) {
            filesystem = (POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
        } else {
            filesystem = new POIFSFileSystem(stream);
        }


        // Parse summary entries first, to make metadata available early
        new SummaryExtractor(metadata).parseSummaries(filesystem);


        // Parse remaining document entries
        boolean outlookExtracted = false;
        for (Entry entry : filesystem.getRoot()) {
            POIFSDocumentType type = POIFSDocumentType.detectType(entry);


            if (type!=POIFSDocumentType.UNKNOWN) {
                setType(metadata, type.getType());
            }


            switch (type) {
                case PUBLISHER:
                    PublisherTextExtractor publisherTextExtractor =
                        new PublisherTextExtractor(filesystem);
                    xhtml.element("p", publisherTextExtractor.getText());
                    break;
                case WORDDOCUMENT:
                    new WordExtractor(context).parse(filesystem, xhtml);
                    break;
                case POWERPOINT:
                    PowerPointExtractor powerPointExtractor =
                        new PowerPointExtractor(filesystem);
                    xhtml.element("p", powerPointExtractor.getText(true, true));
                    break;
                case WORKBOOK:
                    Locale locale = context.get(Locale.class, Locale.getDefault());
                    new ExcelExtractor(context).parse(filesystem, xhtml, locale);
                    break;
                case VISIO:
                    VisioTextExtractor visioTextExtractor =
                        new VisioTextExtractor(filesystem);
                    for (String text : visioTextExtractor.getAllText()) {
                        xhtml.element("p", text);
                    }
                    break;
                case OUTLOOK:
                    if (!outlookExtracted) {
                        outlookExtracted = true;


                        OutlookExtractor extractor =
                            new OutlookExtractor(filesystem, context);


                        extractor.parse(xhtml, metadata);
                    }
                    break;
                case ENCRYPTED:
                    EncryptionInfo info = new EncryptionInfo(filesystem);
                    Decryptor d = new Decryptor(info);


                    try {
                        if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
                            throw new TikaException("Unable to process: document is encrypted");
                        }


                        OOXMLParser parser = new OOXMLParser();


                        parser.parse(d.getDataStream(filesystem), new EmbeddedContentHandler(
                                        new BodyContentHandler(xhtml)),
                                        metadata, context);
                    } catch (GeneralSecurityException ex) {
                        throw new TikaException("Unable to process encrypted document", ex);
                    }
            }
        }


        xhtml.endDocument();
    }

View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        ContentHandler contentHandler;
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        if ("application/vnd.apple.keynote".equals(contentType)) {
          contentHandler = new KeynoteContentHandler(xhtml, metadata);
        } else if ("application/vnd.apple.pages".equals(contentType)) {
          contentHandler = new PagesContentHandler(xhtml, metadata);
        } else if ("application/vnd.apple.numbers".equals(contentType)) {
          contentHandler = new NumbersContentHandler(xhtml, metadata);
        } else {
          return;
        }


        xhtml.startDocument();
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(contentHandler)
        );
        xhtml.endDocument();
    }

View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        new ImageMetadataExtractor(metadata).parseJpeg(stream);


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

            int bom = reader.read();
            if (bom != '\ufeff') { // zero-width no-break space
                reader.reset();
            }


            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();


            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");


            xhtml.endDocument();
        } catch (UnsupportedEncodingException e) {
            throw new TikaException(
                    "Unsupported text encoding: " + encoding, e);
        }
    }

View Full Code Here

            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }


        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");


        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        getContentHandler(handler, metadata)));


        xhtml.endElement("p");
        xhtml.endDocument();
    }

View Full Code Here

        this.extractor = new EmbeddedDocumentExtractor(context);
    }


    public void parse(InputStream stream)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        // At the end we want to close the package/compression stream to
        // release any associated resources, but the underlying document
        // stream should not be closed
        stream = new CloseShieldInputStream(stream);


        // Capture two bytes to determine the packaging/compression format
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
        stream.mark(2);
        int a = stream.read();
        int b = stream.read();
        stream.reset();


        // Select decompression or unpacking mechanism based on the two bytes
        if (a == 'B' && b == 'Z') {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
            decompress(new BZip2CompressorInputStream(stream), xhtml);
        } else if (a == 0x1f && b == 0x8b) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
            decompress(new GZIPInputStream(stream), xhtml);
        } else if (a == 'P' && b == 'K') {
            metadata.set(Metadata.CONTENT_TYPE, "application/zip");
            unpack(new ZipArchiveInputStream(stream), xhtml);
        } else if ((a == '0' && b == '7')
                || (a == 0x71 && b == 0xc7)
                || (a == 0xc7 && b == 0x71)) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
            unpack(new CpioArchiveInputStream(stream), xhtml);
        } else if (a == '=' && (b == '<' || b == '!')) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
            unpack(new ArArchiveInputStream(stream), xhtml);
        } else {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
            unpack(new TarArchiveInputStream(stream), xhtml);
        }


        xhtml.endDocument();
    }

View Full Code Here


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);
        DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {


            private final BitSet textNodeStack = new BitSet();


            private int nodeDepth = 0;


            private int completelyFiltered = 0;


            private Stack<String> headingStack = new Stack<String>();


            @Override
            public void characters(char[] ch, int start, int length)
                    throws SAXException {
                // only forward content of tags from text:-namespace
                if (completelyFiltered == 0 && nodeDepth > 0
                        && textNodeStack.get(nodeDepth - 1)) {
                    super.characters(ch,start,length);
                }
            }


            // helper for checking tags which need complete filtering
            // (with sub-tags)
            private boolean needsCompleteFiltering(
                    String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return localName.endsWith("-template")
                        || localName.endsWith("-style");
                } else if (TABLE_NS.equals(namespaceURI)) {
                    return "covered-table-cell".equals(localName);
                } else {
                    return false;
                }
            }


            // map the heading level to <hX> HTML tags
            private String getXHTMLHeaderTagName(Attributes atts) {
                String depthStr = atts.getValue(TEXT_NS, "outline-level");
                if (depthStr == null) {
                    return "h1";
                }


                int depth = Integer.parseInt(depthStr);
                if (depth >= 6) {
                    return "h6";
                } else if (depth <= 1) {
                    return "h1";
                } else {
                    return "h" + depth;
                }
            }


            /**
             * Check if a node is a text node
             */
            private boolean isTextNode(String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return true;
                }
                if (SVG_NS.equals(namespaceURI)) {
                    return "title".equals(localName) ||
                            "desc".equals(localName);
                }
                return false;
            }


            @Override
            public void startElement(
                    String namespaceURI, String localName, String qName,
                    Attributes atts) throws SAXException {
                // keep track of current node type. If it is a text node,
                // a bit at the current depth ist set in textNodeStack.
                // characters() checks the top bit to determine, if the
                // actual node is a text node to print out nodeDepth contains
                // the depth of the current node and also marks top of stack.
                assert nodeDepth >= 0;


                textNodeStack.set(nodeDepth++, 
                        isTextNode(namespaceURI, localName));
                // filter *all* content of some tags
                assert completelyFiltered >= 0;


                if (needsCompleteFiltering(namespaceURI, localName)) {
                    completelyFiltered++;
                }
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.startElement(headingStack.push(
                                getXHTMLHeaderTagName(atts)));
                    } else {
                        super.startElement(
                                namespaceURI, localName, qName, atts);
                    }
                }
            }


            @Override
            public void endElement(
                    String namespaceURI, String localName, String qName)
                    throws SAXException {
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.endElement(headingStack.pop());
                    } else {
                        super.endElement(namespaceURI,localName,qName);
                    }


                    // special handling of tabulators

View Full Code Here

            if (s.startsWith("Unknown tag")) {
                metadata.remove(s);
            }
        }


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

            } catch (IIOException e) {
                throw new TikaException(type + " parse error", e);
            }
        }


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

de.innosystec.unrar.tika.RARParser

org.apache.jackrabbit.core.query.lucene.BlockingParser

org.apache.jackrabbit.core.query.pdf.PDF2XHTML

org.apache.jackrabbit.oak.http.HtmlRepresentation

org.apache.tika.fork.ForkTestParser

org.apache.tika.parser.asm.XHTMLClassVisitor

org.apache.tika.parser.audio.AudioParser

org.apache.tika.parser.audio.MidiParser

org.apache.tika.parser.chm.ChmParser

org.apache.tika.parser.dwg.DWGParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.