Package org.apache.tika.parser.microsoft

Examples of org.apache.tika.parser.microsoft.OfficeParser


            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put(MediaType.text("html"), new HtmlParser());
            } else if (name.equals("org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put(MediaType.application("vnd.ms-excel"), parser);
                parsers.put(MediaType.application("msexcel"), parser);
                parsers.put(MediaType.application("excel"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
                parsers.put(MediaType.application("vnd.ms-outlook"), new OfficeParser());
            } else if (name.equals("org.apache.jackrabbit.extractor.MsPowerPointExtractor")
                    || name.equals("org.apache.jackrabbit.extractor.MsPowerPointTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put(MediaType.application("vnd.ms-powerpoint"), parser);
                parsers.put(MediaType.application("mspowerpoint"), parser);
                parsers.put(MediaType.application("powerpoint"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put(MediaType.application("vnd.ms-word"), parser);
                parsers.put(MediaType.application("msword"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.MsTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put(MediaType.application("vnd.ms-word"), parser);
                parsers.put(MediaType.application("msword"), parser);
                parsers.put(MediaType.application("vnd.ms-powerpoint"), parser);
                parsers.put(MediaType.application("mspowerpoint"), parser);
                parsers.put(MediaType.application("vnd.ms-excel"), parser);
                parsers.put(MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"), parser);
                parsers.put(MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"), parser);
                parsers.put(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
                Parser parser = new OpenDocumentParser();
                parsers.put(MediaType.application("vnd.oasis.opendocument.database"), parser);
                parsers.put(MediaType.application("vnd.oasis.opendocument.formula"), parser);
                parsers.put(MediaType.application("vnd.oasis.opendocument.graphics"), parser);
                parsers.put(MediaType.application("vnd.oasis.opendocument.presentation"), parser);
                parsers.put(MediaType.application("vnd.oasis.opendocument.spreadsheet"), parser);
                parsers.put(MediaType.application("vnd.oasis.opendocument.text"), parser);
                parsers.put(MediaType.application("vnd.sun.xml.calc"), parser);
                parsers.put(MediaType.application("vnd.sun.xml.draw"), parser);
                parsers.put(MediaType.application("vnd.sun.xml.impress"), parser);
                parsers.put(MediaType.application("vnd.sun.xml.writer"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.PdfTextExtractor")) {
                parsers.put(MediaType.application("pdf"), new PDFParser());
            } else if (name.equals("org.apache.jackrabbit.extractor.PlainTextExtractor")) {
                parsers.put(MediaType.TEXT_PLAIN, new TXTParser());
            } else if (name.equals("org.apache.jackrabbit.extractor.PngTextExtractor")) {
                Parser parser = new ImageParser();
                parsers.put(MediaType.image("png"), parser);
                parsers.put(MediaType.image("apng"), parser);
                parsers.put(MediaType.image("mng"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.RTFTextExtractor")) {
                Parser parser = new RTFParser();
                parsers.put(MediaType.application("rtf"), parser);
                parsers.put(MediaType.text("rtf"), parser);
            } else if (name.equals("org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put(MediaType.APPLICATION_XML, parser);
                parsers.put(MediaType.text("xml"), parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here


            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
                parsers.put("application/excel", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
                parsers.put("application/vnd.ms-outlook", new OfficeParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsPowerPointExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/powerpoint", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
                Parser parser = new OpenOfficeParser();
                parsers.put("application/vnd.oasis.opendocument.database", parser);
                parsers.put("application/vnd.oasis.opendocument.formula", parser);
                parsers.put("application/vnd.oasis.opendocument.graphics", parser);
                parsers.put("application/vnd.oasis.opendocument.presentation", parser);
                parsers.put("application/vnd.oasis.opendocument.spreadsheet", parser);
                parsers.put("application/vnd.oasis.opendocument.text", parser);
                parsers.put("application/vnd.sun.xml.calc", parser);
                parsers.put("application/vnd.sun.xml.draw", parser);
                parsers.put("application/vnd.sun.xml.impress", parser);
                parsers.put("application/vnd.sun.xml.writer", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PdfTextExtractor")) {
                parsers.put("application/pdf", new PDFParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PlainTextExtractor")) {
                parsers.put("text/plain", new TXTParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PngTextExtractor")) {
                Parser parser = new ImageParser();
                parsers.put("image/png", parser);
                parsers.put("image/apng", parser);
                parsers.put("image/mng", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.RTFTextExtractor")) {
                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

    private static void initialize() {
        // No particular parsing context is needed
        ParseContext parseContext = new ParseContext();

        // MS Office Binary File Format
        addConverter( new OfficeParser().getSupportedTypes( parseContext ),
                MSOfficeBinaryConverter.class );

        // Rich Text Format
        addConverter( new RTFParser().getSupportedTypes( parseContext ), RTFConverter.class );
View Full Code Here

            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
                parsers.put("application/excel", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
                parsers.put("application/vnd.ms-outlook", new OfficeParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsPowerPointExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/powerpoint", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
                Parser parser = new OpenDocumentParser();
                parsers.put("application/vnd.oasis.opendocument.database", parser);
                parsers.put("application/vnd.oasis.opendocument.formula", parser);
                parsers.put("application/vnd.oasis.opendocument.graphics", parser);
                parsers.put("application/vnd.oasis.opendocument.presentation", parser);
                parsers.put("application/vnd.oasis.opendocument.spreadsheet", parser);
                parsers.put("application/vnd.oasis.opendocument.text", parser);
                parsers.put("application/vnd.sun.xml.calc", parser);
                parsers.put("application/vnd.sun.xml.draw", parser);
                parsers.put("application/vnd.sun.xml.impress", parser);
                parsers.put("application/vnd.sun.xml.writer", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PdfTextExtractor")) {
                parsers.put("application/pdf", new PDFParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PlainTextExtractor")) {
                parsers.put("text/plain", new TXTParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PngTextExtractor")) {
                Parser parser = new ImageParser();
                parsers.put("image/png", parser);
                parsers.put("image/apng", parser);
                parsers.put("image/mng", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.RTFTextExtractor")) {
                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
                parsers.put("application/excel", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
                parsers.put("application/vnd.ms-outlook", new OfficeParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsPowerPointExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/powerpoint", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-word", parser);
                parsers.put("application/msword", parser);
                parsers.put("application/vnd.ms-powerpoint", parser);
                parsers.put("application/mspowerpoint", parser);
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", parser);
                parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
                Parser parser = new OpenDocumentParser();
                parsers.put("application/vnd.oasis.opendocument.database", parser);
                parsers.put("application/vnd.oasis.opendocument.formula", parser);
                parsers.put("application/vnd.oasis.opendocument.graphics", parser);
                parsers.put("application/vnd.oasis.opendocument.presentation", parser);
                parsers.put("application/vnd.oasis.opendocument.spreadsheet", parser);
                parsers.put("application/vnd.oasis.opendocument.text", parser);
                parsers.put("application/vnd.sun.xml.calc", parser);
                parsers.put("application/vnd.sun.xml.draw", parser);
                parsers.put("application/vnd.sun.xml.impress", parser);
                parsers.put("application/vnd.sun.xml.writer", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PdfTextExtractor")) {
                parsers.put("application/pdf", new PDFParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PlainTextExtractor")) {
                parsers.put("text/plain", new TXTParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.PngTextExtractor")) {
                Parser parser = new ImageParser();
                parsers.put("image/png", parser);
                parsers.put("image/apng", parser);
                parsers.put("image/mng", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.RTFTextExtractor")) {
                Parser parser = new RTFParser();
                parsers.put("application/rtf", parser);
                parsers.put("text/rtf", parser);
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
                Parser parser = new XMLParser();
                parsers.put("application/xml", parser);
                parsers.put("text/xml", parser);
            } else {
                logger.warn("Ignoring unknown text extractor class: {}", name);
            }
        }

        parser.setParsers(parsers);
    }
View Full Code Here

    InputStream input;
    try {
      input = new FileInputStream(new File(f.fileName()));
      ContentHandler textHandler = new BodyContentHandler(-1);
      Metadata metadata = new Metadata();
      OfficeParser parser = new OfficeParser()
      ParseContext context = new ParseContext();
      parser.parse(input, textHandler, metadata, context);
      String[] result = textHandler.toString().split(regex);
      for (int i=0; i<result.length && keepRunning; i++) {
        if (interrupt) {
          processInterrupt();
        }
View Full Code Here

    private static void initialize() {
        // No particular parsing context is needed
        ParseContext parseContext = new ParseContext();

        // MS Office Binary File Format
        addConverter( new OfficeParser().getSupportedTypes( parseContext ),
                MSOfficeBinaryConverter.class );

        // Rich Text Format
        addConverter( new RTFParser().getSupportedTypes( parseContext ), RTFConverter.class );
View Full Code Here

        InputStream input = SolidworksParserTest.class.getResourceAsStream(
                "/test-documents/testsolidworksPart2013SP2.SLDPRT");
        try {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            //Check content type
            assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
            
            //Check properties
View Full Code Here

        InputStream input = SolidworksParserTest.class.getResourceAsStream(
                "/test-documents/testsolidworksPart2014SP0.SLDPRT");
        try {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            //Check content type
            assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
           
            //Check properties
View Full Code Here

        InputStream input = SolidworksParserTest.class.getResourceAsStream(
                "/test-documents/testsolidworksAssembly2013SP2.SLDASM");
        try {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            //Check content type
            assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
           
            //Check properties
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.microsoft.OfficeParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.