Examples of org.apache.poi.hwpf.extractor.WordExtractor

org.apache.poi.hwpf.extractor.WordExtractor
Class to extract the text from a Word Document. You should use either getParagraphText() or getText() unless you have a strong reason otherwise. @author Nick Burch (nick at torchbox dot com)

     * missing
     */
    public void test46817()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug46817.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        String text = extractor.getText().trim();


        assertTrue( text.contains( "Nazwa wykonawcy" ) );
        assertTrue( text.contains( "kujawsko-pomorskie" ) );
        assertTrue( text.contains( "ekomel@ekomel.com.pl" ) );
    }

View Full Code Here

     * @throws IOException
     */
    public void test47286() throws IOException
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug47286.doc" );
        String text1 = new WordExtractor( doc1 ).getText().trim();


        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        String text2 = new WordExtractor( doc2 ).getText().trim();


        // the text in the saved document has some differences in line
        // separators but we tolerate that
        assertEquals( text1.replaceAll( "\n", "" ), text2.replaceAll( "\n", "" ) );

View Full Code Here

     * some website as an embedded object
     */
    public void test47731() throws Exception
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
        String foundText = new WordExtractor( doc ).getText();


        assertTrue( foundText
                .contains( "Soak the rice in water for three to four hours" ) );
    }

View Full Code Here

    public void test47742() throws Exception
    {


        // (1) extract text from MS Word document via POI
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47742.doc" );
        String foundText = new WordExtractor( doc ).getText();


        // (2) read text from text document (retrieved by saving the word
        // document as text file using encoding UTF-8)
        InputStream is = POIDataSamples.getDocumentInstance()
                .openResourceAsStream( "Bug47742-text.txt" );

View Full Code Here

        {
            String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
            HWPFDocument hwpfDocument = HWPFTestDataSamples
                    .openRemoteFile( href );


            WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
            wordExtractor.getText();
        }
    }

View Full Code Here

      
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {

View Full Code Here


    public Document[] parse(final MultiProtocolURI location, final String mimeType,
            final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {


        final WordExtractor extractor;


        try {
            extractor = new WordExtractor(source);
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
        }


        final StringBuilder contents = new StringBuilder(80);
        try {
            contents.append(extractor.getText().trim());
            contents.append(' ');
            contents.append(extractor.getHeaderText());
            contents.append(' ');
            contents.append(extractor.getFooterText());
        } catch (Exception e) {
            throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
        }
        String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
        title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
        if (title.length() > 80) title = title.substring(0, 80);
        int l = title.length();
        while (true) {
            title = title.replaceAll("  ", " ");
            if (title.length() == l) break;
            l = title.length();
        }


        Document[] docs;
        docs = new Document[]{new Document(
                  location,
                  mimeType,
                  "UTF-8",
                  this,
                  null,
                  null,
                  title,
                  "", // TODO: AUTHOR
                  extractor.getDocSummaryInformation().getCompany(), // publisher
                  null,
                  null,
                  0.0f, 0.0f, 
                  UTF8.getBytes(contents.toString()),
                  null,

View Full Code Here

     * Bug 33519 - HWPF fails to read a file
     */
    public void test33519()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug33519.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        extractor.getText();
    }

View Full Code Here

     * Bug 34898 - WordExtractor doesn't read the whole string from the file
     */
    public void test34898()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug34898.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        assertEquals( "\u30c7\u30a3\u30ec\u30af\u30c8\u30ea", extractor
                .getText().trim() );
    }

View Full Code Here

     * Bug 44431 - HWPFDocument.write destroys fields
     */
    public void test44431()
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
        WordExtractor extractor1 = new WordExtractor( doc1 );


        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        WordExtractor extractor2 = new WordExtractor( doc2 );


        assertEquals( extractor1.getFooterText(), extractor2.getFooterText() );
        assertEquals( extractor1.getHeaderText(), extractor2.getHeaderText() );
        assertEquals( Arrays.toString( extractor1.getParagraphText() ),
                Arrays.toString( extractor2.getParagraphText() ) );


        assertEquals( extractor1.getText(), extractor2.getText() );
    }

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.poi.hwpf.extractor.WordExtractor

com.gentics.cr.lucene.indexer.transformer.doc.DOCContentTransformer

com.google.code.ftspc.lector.parsers.POI.DocParser

net.yacy.document.parser.docParser

org.apache.jackrabbit.extractor.MsWordTextExtractor

org.apache.poi.extractor.ExtractorFactory

org.apache.poi.hwpf.converter.WordToTextConverter

org.apache.poi.hwpf.HWPFDocument

org.apache.poi.hwpf.model.TextPiece

org.apache.poi.hwpf.usermodel.HeaderStories

org.apache.poi.hwpf.usermodel.Paragraph

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.