Package org.apache.poi.hwpf.extractor

Examples of org.apache.poi.hwpf.extractor.WordExtractor


    /**
     * Bug 45473 - HWPF cannot read file after save
     */
    public void test45473()
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug45473.doc" );
        String text1 = new WordExtractor( doc1 ).getText().trim();

        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        String text2 = new WordExtractor( doc2 ).getText().trim();

        // the text in the saved document has some differences in line
        // separators but we tolerate that
        assertEquals( text1.replaceAll( "\n", "" ), text2.replaceAll( "\n", "" ) );
    }
View Full Code Here


     * missing
     */
    public void test46817()
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug46817.doc" );
        WordExtractor extractor = new WordExtractor( doc );
        String text = extractor.getText().trim();

        assertTrue( text.contains( "Nazwa wykonawcy" ) );
        assertTrue( text.contains( "kujawsko-pomorskie" ) );
        assertTrue( text.contains( "ekomel@ekomel.com.pl" ) );
    }
View Full Code Here

     * @throws IOException
     */
    public void test47286() throws IOException
    {
        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug47286.doc" );
        String text1 = new WordExtractor( doc1 ).getText().trim();

        HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack( doc1 );
        String text2 = new WordExtractor( doc2 ).getText().trim();

        // the text in the saved document has some differences in line
        // separators but we tolerate that
        assertEquals( text1.replaceAll( "\n", "" ), text2.replaceAll( "\n", "" ) );

View Full Code Here

     * some website as an embedded object
     */
    public void test47731() throws Exception
    {
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
        String foundText = new WordExtractor( doc ).getText();

        assertTrue( foundText
                .contains( "Soak the rice in water for three to four hours" ) );
    }
View Full Code Here

    public void test47742() throws Exception
    {

        // (1) extract text from MS Word document via POI
        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47742.doc" );
        String foundText = new WordExtractor( doc ).getText();

        // (2) read text from text document (retrieved by saving the word
        // document as text file using encoding UTF-8)
        InputStream is = POIDataSamples.getDocumentInstance()
                .openResourceAsStream( "Bug47742-text.txt" );
View Full Code Here

        // TODO: refactor into something nicer!
        if(System.getProperty("poi.test.remote") != null) {
            String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
            HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );

            WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
            wordExtractor.getText();
        }

    }
View Full Code Here

       
        // Create a document for this file
        HWPFDocument doc = new HWPFDocument(fs);

        // Create a WordExtractor to read the text of the word document
        WordExtractor we = new WordExtractor(doc);

        // Extract all paragraphs in the document as strings
        text = we.getText();

        // Output the document
        if(logger.isInfoEnabled())
          logger.info("Word Document has " + text.length() + " chars\n\n" + text);
      }
View Full Code Here

     
      if(entry.getName().equals("Workbook")) {
        return new ExcelExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("WordDocument")) {
        return new WordExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("PowerPoint Document")) {
        return new PowerPointExtractor(poifsDir, fs);
      }
      if(entry.getName().equals("VisioDocument")) {
View Full Code Here

TOP

Related Classes of org.apache.poi.hwpf.extractor.WordExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.