Examples of XHTMLContentHandler


Examples of org.apache.tika.sax.XHTMLContentHandler

            throws IOException, SAXException, TikaException {
       
        Reader reader = Utils.getUTF8Reader(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, "text/plain");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

    private final XHTMLContentHandler handler;

    private PDF2XHTML(ContentHandler handler, Metadata metadata)
            throws IOException {
        this.handler = new XHTMLContentHandler(handler, metadata);
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        xp.extractContent(xmlDoc, "nbPage", "//meta:document-statistic/@meta:page-count", metadata);
        xp.extractContent(xmlDoc, "nbPara", "//meta:document-statistic/@meta:paragraph-count", metadata);
        xp.extractContent(xmlDoc, "nbWord", "//meta:document-statistic/@meta:word-count", metadata);
        xp.extractContent(xmlDoc, "nbcharacter", "//meta:document-statistic/@meta:character-count", metadata);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        new JpegExtractor(metadata).parse(stream);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

            } catch (IIOException e) {
                throw new TikaException(type + " parse error", e);
            }
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

            int bom = reader.read();
            if (bom != '\ufeff') { // zero-width no-break space
                reader.reset();
            }

            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");

            xhtml.endDocument();
        } catch (UnsupportedEncodingException e) {
            throw new TikaException(
                    "Unsupported text encoding: " + encoding, e);
        }
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }

        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");

        getSAXParser(context).parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        getContentHandler(handler, metadata)));

        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

        this.parser = context.get(Parser.class, EmptyParser.INSTANCE);
    }

    public void parse(InputStream stream)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // At the end we want to close the package/compression stream to
        // release any associated resources, but the underlying document
        // stream should not be closed
        stream = new CloseShieldInputStream(stream);

        // Capture two bytes to determine the packaging/compression format
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
        stream.mark(2);
        int a = stream.read();
        int b = stream.read();
        stream.reset();

        // Select decompression or unpacking mechanism based on the two bytes
        if (a == 'B' && b == 'Z') {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
            decompress(new BZip2CompressorInputStream(stream), xhtml);
        } else if (a == 0x1f && b == 0x8b) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
            decompress(new GZIPInputStream(stream), xhtml);
        } else if (a == 'P' && b == 'K') {
            metadata.set(Metadata.CONTENT_TYPE, "application/zip");
            unpack(new ZipArchiveInputStream(stream), xhtml);
        } else if ((a == '0' && b == '7')
                || (a == 0x71 && b == 0xc7)
                || (a == 0xc7 && b == 0x71)) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
            unpack(new CpioArchiveInputStream(stream), xhtml);
        } else if (a == '=' && (b == '<' || b == '!')) {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
            unpack(new ArArchiveInputStream(stream), xhtml);
        } else {
            metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
            unpack(new TarArchiveInputStream(stream), xhtml);
        }

        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        POIFSFileSystem filesystem = new POIFSFileSystem(stream);

        // Parse summary entries first, to make metadata available early
        new SummaryExtractor(metadata).parseSummaries(filesystem);

        // Parse remaining document entries
        boolean outlookExtracted = false;
        Iterator<?> entries = filesystem.getRoot().getEntries();
        while (entries.hasNext()) {
            Entry entry = (Entry) entries.next();
            String name = entry.getName();
            if (entry instanceof DirectoryEntry) {
               if ("Quill".equals(name)) {
                  setType(metadata, "application/x-mspublisher");
                  PublisherTextExtractor extractor =
                      new PublisherTextExtractor(filesystem);
                  xhtml.element("p", extractor.getText());
               }
            } else if (entry instanceof DocumentEntry) {
               if ("WordDocument".equals(name)) {
                   setType(metadata, "application/msword");
                   WordExtractor extractor = new WordExtractor(filesystem);

                   addTextIfAny(xhtml, "header", extractor.getHeaderText());

                   for (String paragraph : extractor.getParagraphText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getFootnoteText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getCommentsText()) {
                       xhtml.element("p", paragraph);
                   }

                   for (String paragraph : extractor.getEndnoteText()) {
                       xhtml.element("p", paragraph);
                   }

                   addTextIfAny(xhtml, "footer", extractor.getFooterText());
               } else if ("PowerPoint Document".equals(name)) {
                   setType(metadata, "application/vnd.ms-powerpoint");
                   PowerPointExtractor extractor =
                       new PowerPointExtractor(filesystem);
                   xhtml.element("p", extractor.getText(true, true));
               } else if ("Workbook".equals(name)) {
                   setType(metadata, "application/vnd.ms-excel");
                   Locale locale = context.get(Locale.class, Locale.getDefault());
                   new ExcelExtractor().parse(filesystem, xhtml, locale);
               } else if ("VisioDocument".equals(name)) {
                   setType(metadata, "application/vnd.visio");
                   VisioTextExtractor extractor =
                       new VisioTextExtractor(filesystem);
                   for (String text : extractor.getAllText()) {
                       xhtml.element("p", text);
                   }
               } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                   // TODO: Cleaner mechanism for detecting Outlook
                   outlookExtracted = true;
                   setType(metadata, "application/vnd.ms-outlook");
                   new OutlookExtractor(filesystem).parse(xhtml, metadata);
               }
            }
        }

        xhtml.endDocument();
    }
View Full Code Here

Examples of org.apache.tika.sax.XHTMLContentHandler

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // Create handlers for the various kinds of ID3 tags
        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);

        if (audioAndTags.tags.length > 0) {
           CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);

           metadata.set(Metadata.TITLE, tag.getTitle());
           metadata.set(Metadata.AUTHOR, tag.getArtist());
           metadata.set(XMPDM.ARTIST, tag.getArtist());
           metadata.set(XMPDM.ALBUM, tag.getAlbum());
           metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
           metadata.set(XMPDM.GENRE, tag.getGenre());
           metadata.set(XMPDM.LOG_COMMENT, tag.getComment());

           xhtml.element("h1", tag.getTitle());
           xhtml.element("p", tag.getArtist());

            // ID3v1.1 Track addition
            if (tag.getTrackNumber() != null) {
                xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
            } else {
                xhtml.element("p", tag.getAlbum());
            }
            xhtml.element("p", tag.getYear());
            xhtml.element("p", tag.getComment());
            xhtml.element("p", tag.getGenre());
        }
        if (audioAndTags.audio != null) {
            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
            metadata.set("version", audioAndTags.audio.getVersion());
            metadata.set(
                    XMPDM.AUDIO_SAMPLE_RATE,
                    Integer.toString(audioAndTags.audio.getSampleRate()));
        }

        xhtml.endDocument();
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.