Package org.apache.tika.parser.xml

Examples of org.apache.tika.parser.xml.XMLParser


    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        Document xmlDoc = parse(stream);
        XMLParser xp = new XMLParser();
        xp.getAllDocumentNs(xmlDoc);
        xp.extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
        xp.extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
        xp.extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
        xp.extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description", metadata);
        xp.extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
        xp.extractContent(xmlDoc, Metadata.KEYWORDS, "//meta:keyword", metadata);
        xp.extractContent(xmlDoc, Metadata.DATE, "//dc:date", metadata);
        xp.extractContent(xmlDoc, "nbTab", "//meta:document-statistic/@meta:table-count", metadata);
        xp.extractContent(xmlDoc, "nbObject", "//meta:document-statistic/@meta:object-count", metadata);
        xp.extractContent(xmlDoc, "nbImg", "//meta:document-statistic/@meta:image-count", metadata);
        xp.extractContent(xmlDoc, "nbPage", "//meta:document-statistic/@meta:page-count", metadata);
        xp.extractContent(xmlDoc, "nbPara", "//meta:document-statistic/@meta:paragraph-count", metadata);
        xp.extractContent(xmlDoc, "nbWord", "//meta:document-statistic/@meta:word-count", metadata);
        xp.extractContent(xmlDoc, "nbcharacter", "//meta:document-statistic/@meta:character-count", metadata);

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
        xhtml.endElement("p");
        xhtml.endDocument();
    }
View Full Code Here


            } else if (name.equals("org.apache.jackrabbit.extractor.RTFTextExtractor")) {
                Parser parser = new RTFParser();
                parsers.put(MediaType.application("rtf"), parser);
                parsers.put(MediaType.text("rtf"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put(MediaType.APPLICATION_XML, parser);
                parsers.put(MediaType.text("xml"), parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.xml.XMLParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.