Package org.apache.tika.sax

Examples of org.apache.tika.sax.XHTMLContentHandler


            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "audio/midi");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // MidiSystem expects the stream to support the mark feature
        InputStream buffered = new BufferedInputStream(stream);
        try {
            Sequence sequence = MidiSystem.getSequence(buffered);

            Track[] tracks = sequence.getTracks();
            metadata.set("tracks", String.valueOf(tracks.length));
            // TODO: Use XMPDM.TRACKS?

            Patch[] patches = sequence.getPatchList();
            metadata.set("patches", String.valueOf(patches.length));

            float type = sequence.getDivisionType();
            if (type == Sequence.PPQ) {
                metadata.set("divisionType", "PPQ");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", "SMPTE_24");
            } else if (type == Sequence.SMPTE_25) {
                metadata.set("divisionType", "SMPTE_25");
            } else if (type == Sequence.SMPTE_30) {
                metadata.set("divisionType", "SMPTE_30");
            } else if (type == Sequence.SMPTE_30DROP) {
                metadata.set("divisionType", "SMPTE_30DROP");
            } else if (type == Sequence.SMPTE_24) {
                metadata.set("divisionType", String.valueOf(type));
            }

            for (Track track : tracks) {
                xhtml.startElement("p");
                for (int i = 0; i < track.size(); i++) {
                    MidiMessage message = track.get(i).getMessage();
                    if (message instanceof MetaMessage) {
                        MetaMessage meta = (MetaMessage) message;
                        // Types 1-15 are reserved for text events
                        if (meta.getType() >= 1 && meta.getType() <= 15) {
                            // FIXME: What's the encoding?
                            xhtml.characters(
                                    new String(meta.getData(), "ISO-8859-1"));
                        }
                    }
                }
                xhtml.endElement("p");
            }
        } catch (InvalidMidiDataException ignore) {
            // There is no way to know whether this exception was
            // caused by the document being corrupted or by the format
            // just being unsupported. So we do nothing.
        }

        xhtml.endDocument();
    }
View Full Code Here


     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        POIFSFileSystem filesystem;
        if(stream instanceof TikaInputStream &&
          ((TikaInputStream)stream).getOpenContainer() != null) {
            filesystem = (POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
        } else {
            filesystem = new POIFSFileSystem(stream);
        }

        // Parse summary entries first, to make metadata available early
        new SummaryExtractor(metadata).parseSummaries(filesystem);

        // Parse remaining document entries
        boolean outlookExtracted = false;
        for (Entry entry : filesystem.getRoot()) {
            POIFSDocumentType type = POIFSDocumentType.detectType(entry);

            if (type!=POIFSDocumentType.UNKNOWN) {
                setType(metadata, type.getType());
            }

            switch (type) {
                case PUBLISHER:
                    PublisherTextExtractor publisherTextExtractor =
                        new PublisherTextExtractor(filesystem);
                    xhtml.element("p", publisherTextExtractor.getText());
                    break;
                case WORDDOCUMENT:
                    new WordExtractor(context).parse(filesystem, xhtml);
                    break;
                case POWERPOINT:
                    PowerPointExtractor powerPointExtractor =
                        new PowerPointExtractor(filesystem);
                    xhtml.element("p", powerPointExtractor.getText(true, true));
                    break;
                case WORKBOOK:
                    Locale locale = context.get(Locale.class, Locale.getDefault());
                    new ExcelExtractor(context).parse(filesystem, xhtml, locale);
                    break;
                case VISIO:
                    VisioTextExtractor visioTextExtractor =
                        new VisioTextExtractor(filesystem);
                    for (String text : visioTextExtractor.getAllText()) {
                        xhtml.element("p", text);
                    }
                    break;
                case OUTLOOK:
                    if (!outlookExtracted) {
                        outlookExtracted = true;

                        OutlookExtractor extractor =
                            new OutlookExtractor(filesystem, context);

                        extractor.parse(xhtml, metadata);
                    }
                    break;
                case ENCRYPTED:
                    EncryptionInfo info = new EncryptionInfo(filesystem);
                    Decryptor d = new Decryptor(info);

                    try {
                        if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
                            throw new TikaException("Unable to process: document is encrypted");
                        }

                        OOXMLParser parser = new OOXMLParser();

                        parser.parse(d.getDataStream(filesystem), new EmbeddedContentHandler(
                                        new BodyContentHandler(xhtml)),
                                        metadata, context);
                    } catch (GeneralSecurityException ex) {
                        throw new TikaException("Unable to process encrypted document", ex);
                    }
            }
        }

        xhtml.endDocument();
    }
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        ContentHandler contentHandler;
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        if ("application/vnd.apple.keynote".equals(contentType)) {
          contentHandler = new KeynoteContentHandler(xhtml, metadata);
        } else if ("application/vnd.apple.pages".equals(contentType)) {
          contentHandler = new PagesContentHandler(xhtml, metadata);
        } else if ("application/vnd.apple.numbers".equals(contentType)) {
          contentHandler = new NumbersContentHandler(xhtml, metadata);
        } else {
          return;
        }

        xhtml.startDocument();
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(contentHandler)
        );
        xhtml.endDocument();
    }
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        new ImageMetadataExtractor(metadata).parseJpeg(stream);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

            int bom = reader.read();
            if (bom != '\ufeff') { // zero-width no-break space
                reader.reset();
            }

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");

            xhtml.endDocument();
        } catch (UnsupportedEncodingException e) {
            throw new TikaException(
                    "Unsupported text encoding: " + encoding, e);
        }
    }
View Full Code Here

            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }

        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");

        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        getContentHandler(handler, metadata)));

        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

        this.extractor = new EmbeddedDocumentExtractor(context);
    }

    public void parse(InputStream stream)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // At the end we want to close the package/compression stream to
        // release any associated resources, but the underlying document
        // stream should not be closed
        stream = new CloseShieldInputStream(stream);

        // Capture two bytes to determine the packaging/compression format
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
        stream.mark(2);
        int a = stream.read();
        int b = stream.read();
        stream.reset();

        // Select decompression or unpacking mechanism based on the two bytes
        if (a == 'B' && b == 'Z') {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
            decompress(new BZip2CompressorInputStream(stream), xhtml);
        } else if (a == 0x1f && b == 0x8b) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
            decompress(new GZIPInputStream(stream), xhtml);
        } else if (a == 'P' && b == 'K') {
            metadata.set(Metadata.CONTENT_TYPE, "application/zip");
            unpack(new ZipArchiveInputStream(stream), xhtml);
        } else if ((a == '0' && b == '7')
                || (a == 0x71 && b == 0xc7)
                || (a == 0xc7 && b == 0x71)) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
            unpack(new CpioArchiveInputStream(stream), xhtml);
        } else if (a == '=' && (b == '<' || b == '!')) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
            unpack(new ArArchiveInputStream(stream), xhtml);
        } else {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
            unpack(new TarArchiveInputStream(stream), xhtml);
        }

        xhtml.endDocument();
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);
        DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {

            private final BitSet textNodeStack = new BitSet();

            private int nodeDepth = 0;

            private int completelyFiltered = 0;

            private Stack<String> headingStack = new Stack<String>();

            @Override
            public void characters(char[] ch, int start, int length)
                    throws SAXException {
                // only forward content of tags from text:-namespace
                if (completelyFiltered == 0 && nodeDepth > 0
                        && textNodeStack.get(nodeDepth - 1)) {
                    super.characters(ch,start,length);
                }
            }

            // helper for checking tags which need complete filtering
            // (with sub-tags)
            private boolean needsCompleteFiltering(
                    String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return localName.endsWith("-template")
                        || localName.endsWith("-style");
                } else if (TABLE_NS.equals(namespaceURI)) {
                    return "covered-table-cell".equals(localName);
                } else {
                    return false;
                }
            }

            // map the heading level to <hX> HTML tags
            private String getXHTMLHeaderTagName(Attributes atts) {
                String depthStr = atts.getValue(TEXT_NS, "outline-level");
                if (depthStr == null) {
                    return "h1";
                }

                int depth = Integer.parseInt(depthStr);
                if (depth >= 6) {
                    return "h6";
                } else if (depth <= 1) {
                    return "h1";
                } else {
                    return "h" + depth;
                }
            }

            /**
             * Check if a node is a text node
             */
            private boolean isTextNode(String namespaceURI, String localName) {
                if (TEXT_NS.equals(namespaceURI)) {
                    return true;
                }
                if (SVG_NS.equals(namespaceURI)) {
                    return "title".equals(localName) ||
                            "desc".equals(localName);
                }
                return false;
            }

            @Override
            public void startElement(
                    String namespaceURI, String localName, String qName,
                    Attributes atts) throws SAXException {
                // keep track of current node type. If it is a text node,
                // a bit at the current depth ist set in textNodeStack.
                // characters() checks the top bit to determine, if the
                // actual node is a text node to print out nodeDepth contains
                // the depth of the current node and also marks top of stack.
                assert nodeDepth >= 0;

                textNodeStack.set(nodeDepth++,
                        isTextNode(namespaceURI, localName));
                // filter *all* content of some tags
                assert completelyFiltered >= 0;

                if (needsCompleteFiltering(namespaceURI, localName)) {
                    completelyFiltered++;
                }
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.startElement(headingStack.push(
                                getXHTMLHeaderTagName(atts)));
                    } else {
                        super.startElement(
                                namespaceURI, localName, qName, atts);
                    }
                }
            }

            @Override
            public void endElement(
                    String namespaceURI, String localName, String qName)
                    throws SAXException {
                // call next handler if no filtering
                if (completelyFiltered == 0) {
                    // special handling of text:h, that are directly passed
                    // to xhtml handler
                    if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                        xhtml.endElement(headingStack.pop());
                    } else {
                        super.endElement(namespaceURI,localName,qName);
                    }

                    // special handling of tabulators
View Full Code Here

            if (s.startsWith("Unknown tag")) {
                metadata.remove(s);
            }
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

            } catch (IIOException e) {
                throw new TikaException(type + " parse error", e);
            }
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.