// Source code of org.apache.poi.hwpf.extractor.WordExtractor

package org.apache.poi.hwpf.extractor;


import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;


import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;


/**
 * Class to extract the text from a Word Document.
 * 
 * You should use either getParagraphText() or getText() unless
 *  you have a strong reason otherwise.
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
public class WordExtractor {
  private POIFSFileSystem fs;
  private HWPFDocument doc;
  
  /**
   * Create a new Word Extractor
   * @param is InputStream containing the word file
   */
  public WordExtractor(InputStream is) throws IOException {
    this(new POIFSFileSystem(is));
  }


  /**
   * Create a new Word Extractor
   * @param fs POIFSFileSystem containing the word file
   */
  public WordExtractor(POIFSFileSystem fs) throws IOException {
    this(new HWPFDocument(fs));
    this.fs = fs;
  }
  
  /**
   * Create a new Word Extractor
   * @param doc The HWPFDocument to extract from
   */
  public WordExtractor(HWPFDocument doc) throws IOException {
    this.doc = doc;
  }
  
  /**
   * Get the text from the word file, as an array with one String
   *  per paragraph
   */
  public String[] getParagraphText() {
    String[] ret;
    
    // Extract using the model code
    try {
        Range r = doc.getRange();


      ret = new String[r.numParagraphs()];
      for(int i=0; i<ret.length; i++) {
        Paragraph p = r.getParagraph(i);
        ret[i] = p.text();
        
        // Fix the line ending
        if(ret[i].endsWith("\r")) {
          ret[i] = ret[i] + "\n";
        }
      }
    } catch(Exception e) {
      // Something's up with turning the text pieces into paragraphs
      // Fall back to ripping out the text pieces
      ret = new String[1];
      ret[0] = getTextFromPieces();
    }
    
    return ret;
  }
  
  /**
   * Grab the text out of the text pieces. Might also include various
   *  bits of crud, but will work in cases where the text piece -> paragraph
   *  mapping is broken. Fast too.
   */
  public String getTextFromPieces() {
      StringBuffer textBuf = new StringBuffer();
      
      Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
      while (textPieces.hasNext()) {
        TextPiece piece = (TextPiece) textPieces.next();


        String encoding = "Cp1252";
        if (piece.usesUnicode()) {
          encoding = "UTF-16LE";
        }
        try {
          String text = new String(piece.getRawBytes(), encoding);
          textBuf.append(text);
        } catch(UnsupportedEncodingException e) {
          throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        }
      }
      
      String text = textBuf.toString();
      
      // Fix line endings (Note - won't get all of them
      text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
      text = text.replaceAll("\r\r", "\r\n\r\n");
      
      if(text.endsWith("\r")) {
        text += "\n";
      }
      
      return text;
  }
  
  /**
   * Grab the text, based on the paragraphs. Shouldn't include any crud,
   *  but slightly slower than getTextFromPieces().
   */
  public String getText() {
    StringBuffer ret = new StringBuffer();
    String[] text = getParagraphText();
    for(int i=0; i<text.length; i++) {
      ret.append(text[i]);
    }
    return ret.toString();
  }
}
// Source code of org.apache.poi.hwpf.extractor.WordExtractor

// Related classes of org.apache.poi.hwpf.extractor.WordExtractor