Examples of org.apache.tika.sax.XHTMLContentHandler

org.apache.tika.sax.XHTMLContentHandler
Content handler decorator that simplifies the task of producing XHTML events for Tika content parsers.


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler,metadata);


        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(false);
            factory.setNamespaceAware(true);

View Full Code Here

        // First up, which version of the format are we handling?
        byte[] header = new byte[128];
        IOUtils.readFully(stream, header);
        String version = new String(header, 0, 6, "US-ASCII");


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        if (version.equals("AC1015")) {
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipTo2000PropertyInfoSection(stream, header)) {
                get2000Props(stream,metadata,xhtml);
            }
        } else if (version.equals("AC1018")) {
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2004Props(stream,metadata,xhtml);
            }
        } else if (version.equals("AC1021") || version.equals("AC1024")) {
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2007and2010Props(stream,metadata,xhtml);
            }
        } else {
            throw new TikaException(
                    "Unsupported AutoCAD drawing version: " + version);
        }


        xhtml.endDocument();
    }

View Full Code Here

     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        final DirectoryNode root;
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream == null) {
            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
        } else {
            final Object container = tstream.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) container).getRoot();
            } else if (container instanceof DirectoryNode) {
                root = (DirectoryNode) container;
            } else if (tstream.hasFile()) {
                root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
            } else {
                root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
            }
        }
        parse(root, context, metadata, xhtml);
        xhtml.endDocument();
    }

View Full Code Here

        // Next is the Layer and Mask Info
        // Finally we have Image Data
        // We can't do anything with these parts
        
        // We don't have any helpful text, sorry...
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

       addMetadataByString( metadata, "FontWeight", fontMetrics.getWeight() );
       addMetadataByString( metadata, "FontNotice", fontMetrics.getNotice() );
       addMetadataByString( metadata, "FontUnderlineThickness", Float.toString( fontMetrics.getUnderlineThickness() ) );


       // Output the remaining comments as text
       XHTMLContentHandler xhtml = new XHTMLContentHandler( handler, metadata );
       xhtml.startDocument();


       // Display the comments
       if (comments.size() > 0) {
          xhtml.element( "h1", "Comments" );
          xhtml.startElement("div", "class", "comments");
          for (String comment : comments) {
              xhtml.element( "p", comment );
          }
          xhtml.endElement("div");
       }


       xhtml.endDocument();
    }

View Full Code Here

     *      org.apache.tika.metadata.Metadata)
     */
    public void getXHTML(
            ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        buildXHTML(xhtml);


        // Now do any embedded parts
        handleEmbeddedParts(handler);


        xhtml.endDocument();
    }

View Full Code Here

        metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated().getTime());
        metadata.set(
                TikaCoreProperties.MODIFIED,
                font.getHeader().getModified().getTime());


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.endDocument();
    }

View Full Code Here

        BufferedReader reader = new BufferedReader(isr);


        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");


        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();


        ParseStates parseState = ParseStates.START;
        String multiLine = null;
        boolean inQuote = false;
        int numEmails = 0;


        // We're going to scan, line-by-line, for a line that starts with
        // "From "
        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
            if (newMessage) {
                numEmails += 1;
            }


            switch (parseState) {
            case START:
                if (newMessage) {
                    parseState = ParseStates.IN_HEADER;
                    newMessage = false;
                    // Fall through to IN_HEADER
                } else {
                    break;
                }


            case IN_HEADER:
                if (newMessage) {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                } else if (curLine.length() == 0) {
                    // Blank line is signal that we're transitioning to the content.
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    parseState = ParseStates.IN_CONTENT;


                    // Mimic what PackageParser does between entries.
                    xhtml.startElement("div", "class", "email-entry");
                    xhtml.startElement("p");
                    inQuote = false;
                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                    multiLine += " " + curLine.trim();
                } else {
                    saveHeaderInMetadata(numEmails, metadata, multiLine);
                    multiLine = curLine;
                }


                break;


                // TODO - use real email parsing support so we can correctly handle
                // things like multipart messages and quoted-printable encoding.
                // We'd also want this for charset handling, where content isn't 7-bit
                // ascii.
            case IN_CONTENT:
                if (newMessage) {
                    endMessage(xhtml, inQuote);
                    parseState = ParseStates.IN_HEADER;
                    multiLine = curLine;
                } else {
                    boolean quoted = curLine.startsWith(">");
                    if (inQuote) {
                        if (!quoted) {
                            xhtml.endElement("q");
                            inQuote = false;
                        }
                    } else if (quoted) {
                        xhtml.startElement("q");
                        inQuote = true;
                    }


                    xhtml.characters(curLine);


                    // For plain text email, each line is a real break position.
                    xhtml.element("br", "");
                }
            }
        }


        if (parseState == ParseStates.IN_HEADER) {
            saveHeaderInMetadata(numEmails, metadata, multiLine);
        } else if (parseState == ParseStates.IN_CONTENT) {
            endMessage(xhtml, inQuote);
        }


        xhtml.endDocument();
    }

View Full Code Here

            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }


        final XHTMLContentHandler xhtml =
            new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");


        TaggedContentHandler tagged = new TaggedContentHandler(handler);
        try {
            context.getSAXParser().parse(
                    new CloseShieldInputStream(stream),
                    new OfflineContentHandler(new EmbeddedContentHandler(
                            getContentHandler(tagged, metadata, context))));
        } catch (SAXException e) {
            tagged.throwIfCauseOf(e);
            throw new TikaException("XML parse error", e);
        }


        xhtml.endElement("p");
        xhtml.endDocument();
    }

View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // We only do metadata, for now
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);


        // What kind is it?
        byte[] first4 = new byte[4];
        IOUtils.readFully(stream, first4);
        
        if (first4[0] == (byte)'M' && first4[1] == (byte)'Z') {
           parsePE(xhtml, metadata, stream, first4);
        } else if (first4[0] == (byte)0x7f && first4[1] == (byte)'E' &&
                   first4[2] == (byte)'L' && first4[3] == (byte)'F') {
           parseELF(xhtml, metadata, stream, first4);
        }
        
        
        // Finish everything
        xhtml.endDocument();
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.sax.XHTMLContentHandler

de.innosystec.unrar.tika.RARParser

org.apache.jackrabbit.core.query.lucene.BlockingParser

org.apache.jackrabbit.core.query.pdf.PDF2XHTML

org.apache.jackrabbit.oak.http.HtmlRepresentation

org.apache.tika.fork.ForkTestParser

org.apache.tika.parser.asm.XHTMLClassVisitor

org.apache.tika.parser.audio.AudioParser

org.apache.tika.parser.audio.MidiParser

org.apache.tika.parser.chm.ChmParser

org.apache.tika.parser.dwg.DWGParser

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.