Package org.apache.poi.hwpf.extractor

Examples of org.apache.poi.hwpf.extractor.WordExtractor


    public void test47742() throws Exception
    {

        // (1) extract text from MS Word document via POI
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47742.doc" );
        String foundText = new WordExtractor( doc ).getText();

        // (2) read text from text document (retrieved by saving the word
        // document as text file using encoding UTF-8)
        InputStream is = POIDataSamples.getDocumentInstance()
                .openResourceAsStream( "Bug47742-text.txt" );
View Full Code Here


        {
            String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
            HWPFDocument hwpfDocument = HWPFTestDataSamples
                    .openRemoteFile( href );

            WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
            wordExtractor.getText();
        }
    }
View Full Code Here

        }

        if (poifsDir.hasEntry("WordDocument")) {
            // Old or new style word document?
            try {
                return new WordExtractor(poifsDir);
            } catch (OldWordFileFormatException e) {
                return new Word6Extractor(poifsDir);
            }
        }
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

         } else {
            return new ExcelExtractor(poifsDir, fs);
         }
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

     */
    public Reader extractText(InputStream stream,
                              String type,
                              String encoding) throws IOException {
        try {
            return new StringReader(new WordExtractor(stream).getText());
        } catch (Exception e) {
            logger.warn("Failed to extract Word text content", e);
            return new StringReader("");
        } finally {
            stream.close();
View Full Code Here

            } else if (SUMMARY_INFORMATION.equals(name)
                    || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
                parse((DocumentEntry) entry, metadata);
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);
                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                new ExcelExtractor().parse(filesystem, xhtml);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (name.startsWith("__substg1.0_")) {
                setType(metadata, "application/vnd.ms-outlook");
                new OutlookExtractor(filesystem).parse(xhtml, metadata);
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

            } else if (SUMMARY_INFORMATION.equals(name)
                    || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
                parse((DocumentEntry) entry, metadata);
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);
                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                new ExcelExtractor().parse(filesystem, xhtml);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;
View Full Code Here

TOP

Related Classes of org.apache.poi.hwpf.extractor.WordExtractor

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.