Examples of WordExtractor

org.apache.poi.hwpf.extractor.WordExtractor
Class to extract the text from a Word Document. You should use either getParagraphText() or getText() unless you have a strong reason otherwise. @author Nick Burch (nick at torchbox dot com)
org.textmining.text.extraction.WordExtractor
This class extracts the text from a Word 97/2000/XP document. @author Ryan Ackley
penny.parser.WordExtractor
@author john

Examples of org.apache.poi.hwpf.extractor.WordExtractor


    public Document[] parse(final MultiProtocolURI location, final String mimeType,
            final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {


        final WordExtractor extractor;


        try {
            extractor = new WordExtractor(source);
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
        }


        final StringBuilder contents = new StringBuilder(80);
        try {
            contents.append(extractor.getText().trim());
            contents.append(' ');
            contents.append(extractor.getHeaderText());
            contents.append(' ');
            contents.append(extractor.getFooterText());
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
        }
        String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
        title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
        if (title.length() > 80) title = title.substring(0, 80);
        int l = title.length();
        while (true) {
            title = title.replaceAll("  ", " ");
            if (title.length() == l) break;
            l = title.length();
        }


        Document[] docs;
        docs = new Document[]{new Document(
                  location,
                  mimeType,
                  "UTF-8",
                  this,
                  null,
                  null,
                  title,
                  "", // TODO: AUTHOR
                  extractor.getDocSummaryInformation().getCompany(), // publisher
                  null,
                  null,
                  0.0f, 0.0f, 
                  UTF8.getBytes(contents.toString()),
                  null,

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

    }
  }
  
  private void collectWordDocument(POIFSFileSystem filesystem, StringBuilder sb) 
    throws IOException {
    WordExtractor extractor = new WordExtractor(filesystem);
    addTextIfAny(sb, extractor.getHeaderText());
    for (String paragraph : extractor.getParagraphText()) {
        sb.append(paragraph).append(' ');
    }


    for (String paragraph : extractor.getFootnoteText()) {
        sb.append(paragraph).append(' ');
    }


    for (String paragraph : extractor.getCommentsText()) {
        sb.append(paragraph).append(' ');
    }


    for (String paragraph : extractor.getEndnoteText()) {
        sb.append(paragraph).append(' ');
    }
    addTextIfAny(sb, extractor.getFooterText());
  }

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

    @Override
    public void run() {
        InputStream isr = null;
        try {
            isr = new FileInputStream(pathToFile);
            WordExtractor word = new WordExtractor(isr);
            String fileContent = "";
            String[] paragraphes = word.getParagraphText();
            for (String paragraph : paragraphes) {
                fileContent += " " + paragraph;
            }
            AddDataToIndex AddDataToIndex = new AddDataToIndex(null);
            AddDataToIndex.doAddData(fileContent, pathToFile, fileName);

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);


                addTextIfAny(xhtml, "header", extractor.getHeaderText());


                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }


                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                Locale locale = context.get(Locale.class, Locale.getDefault());
                new ExcelExtractor().parse(filesystem, xhtml, locale);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

      
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

                  xhtml.element("p", extractor.getText());
               }
            } else if (entry instanceof DocumentEntry) {
               if ("WordDocument".equals(name)) {
                   setType(metadata, "application/msword");
                   WordExtractor extractor = new WordExtractor(filesystem);


                   addTextIfAny(xhtml, "header", extractor.getHeaderText());


                   for (String paragraph : extractor.getParagraphText()) {
                       xhtml.element("p", paragraph);
                   }


                   for (String paragraph : extractor.getFootnoteText()) {
                       xhtml.element("p", paragraph);
                   }


                   for (String paragraph : extractor.getCommentsText()) {
                       xhtml.element("p", paragraph);
                   }


                   for (String paragraph : extractor.getEndnoteText()) {
                       xhtml.element("p", paragraph);
                   }


                   addTextIfAny(xhtml, "footer", extractor.getFooterText());
               } else if ("PowerPoint Document".equals(name)) {
                   setType(metadata, "application/vnd.ms-powerpoint");
                   PowerPointExtractor extractor =
                       new PowerPointExtractor(filesystem);
                   xhtml.element("p", extractor.getText(true, true));
               } else if ("Workbook".equals(name)) {
                   setType(metadata, "application/vnd.ms-excel");
                   Locale locale = context.get(Locale.class, Locale.getDefault());
                   new ExcelExtractor().parse(filesystem, xhtml, locale);
               } else if ("VisioDocument".equals(name)) {
                   setType(metadata, "application/vnd.visio");
                   VisioTextExtractor extractor =
                       new VisioTextExtractor(filesystem);
                   for (String text : extractor.getAllText()) {
                       xhtml.element("p", text);
                   }
               } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                   // TODO: Cleaner mechanism for detecting Outlook
                   outlookExtracted = true;

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

    } else {
      throw new IllegalArgumentException("Parameter must be instance of byte[]");
    }
    String ret = null;
    try {
      WordExtractor docextractor = new WordExtractor(is);
      ret = docextractor.getText();
    } catch (OldWordFileFormatException e) {
      try {
        is.reset();
        Word6Extractor docextractor = new Word6Extractor(is);
        ret = docextractor.getText();
      } catch (IOException e1) {
        throw new CRException(e1);
      }


    } catch (IOException e) {

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

      
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

            String name = entry.getName();
            if (!(entry instanceof DocumentEntry)) {
                // Skip directory entries
            } else if ("WordDocument".equals(name)) {
                setType(metadata, "application/msword");
                WordExtractor extractor = new WordExtractor(filesystem);


                addTextIfAny(xhtml, "header", extractor.getHeaderText());


                for (String paragraph : extractor.getParagraphText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getFootnoteText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getCommentsText()) {
                    xhtml.element("p", paragraph);
                }


                for (String paragraph : extractor.getEndnoteText()) {
                    xhtml.element("p", paragraph);
                }


                addTextIfAny(xhtml, "footer", extractor.getFooterText());
            } else if ("PowerPoint Document".equals(name)) {
                setType(metadata, "application/vnd.ms-powerpoint");
                PowerPointExtractor extractor =
                    new PowerPointExtractor(filesystem);
                xhtml.element("p", extractor.getText(true, true));
            } else if ("Workbook".equals(name)) {
                setType(metadata, "application/vnd.ms-excel");
                new ExcelExtractor().parse(filesystem, xhtml);
            } else if ("VisioDocument".equals(name)) {
                setType(metadata, "application/vnd.visio");
                VisioTextExtractor extractor =
                    new VisioTextExtractor(filesystem);
                for (String text : extractor.getAllText()) {
                    xhtml.element("p", text);
                }
            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
                // TODO: Cleaner mechanism for detecting Outlook
                outlookExtracted = true;

View Full Code Here

Examples of org.apache.poi.hwpf.extractor.WordExtractor

     * Bug 33519 - HWPF fails to read a file
     */
    public void test33519()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug33519.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        extractor.getText();
    }

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.