Examples of org.pdfbox.util.PDFTextStripper

org.pdfbox.util.PDFTextStripper
This class takes a PDF document and extracts all of its text, ignoring formatting and layout. @author Ben Litchfield @version $Revision: 1.69 $

    
    return super.parseProperty( key, values, metadata );
  }


  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

    this.wordReader = new FastBufferedReader();
  }
  
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

    this.wordReader = new FastBufferedReader();
  }


  public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException {
    super( property );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

View Full Code Here

    return FieldType.TEXT;
  }


  /**
   * Custom deserialization hook: restores the default-serialized fields and
   * then creates a new {@link PDFTextStripper} for this instance.
   *
   * NOTE(review): the stripper is rebuilt here rather than read from the
   * stream, so the field is presumably transient -- confirm at its declaration.
   *
   * @param s the stream this object is being read from.
   * @throws IOException if reading from the stream or creating the stripper fails.
   * @throws ClassNotFoundException if a serialized class cannot be resolved.
   */
  private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
    // Restore all fields that were written by default serialization.
    s.defaultReadObject();
    // Re-create the text stripper, which is not taken from the stream.
    textStripper = new PDFTextStripper();
  }

View Full Code Here

  return; }


  //*-- extract PDF document's textual content
  // docText remains null when extraction fails with an OutOfMemoryError (see the catch below).
  String docText = null;
  try 
  { PDFTextStripper stripper = new PDFTextStripper();
    // Wrap cosDoc in a PDDocument and pull out its text as one string.
    // NOTE(review): the PDDocument created here is not closed in this block -- presumably
    // cosDoc is closed by the surrounding code; confirm.
    docText = stripper.getText(new PDDocument(cosDoc));
  }
  // Only OutOfMemoryError is handled here: the COS document is closed and the failure is logged.
  // NOTE(review): only exc.getMessage() is logged, so the stack trace is not recorded.
  catch (OutOfMemoryError exc) 
  { closeCOSDocument(cosDoc);
    logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage()); 
  }

View Full Code Here

    {
        URL url = new URL("http://localhost:8080/xwiki/bin/export/Main/WebHome?format=pdf");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        InputStream is = connection.getInputStream();
        PDDocument pdd = PDDocument.load(is);
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(pdd);
        pdd.close();
        is.close();


        assertTrue("Invalid content", text.contains("Welcome to your wiki"));
    }

View Full Code Here

      document = PDDocument.load(bis);
      if (document.isEncrypted()) {
        throw new DocumentAccessException("PDF is encrypted. Can not read content file=" + leaf.getName());
      }      
      if (log.isDebug()) log.debug("readContent PDDocument loaded");
      PDFTextStripper stripper = new PDFTextStripper();
      return stripper.getText(document);
    } finally {
      if (document != null) {
        document.close();
      }
      if (bis != null) {

View Full Code Here


            //create a writer where to append the text content.
            StringWriter writer = new StringWriter();
            if( stripper == null )
            {
                stripper = new PDFTextStripper();
            }
            else
            {
                stripper.resetEngine();
            }

View Full Code Here

     * @throws IOException If there is an error creating the test.
     */
    public TestTextStripper( String name ) throws IOException
    {
        super( name );
        stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
    }

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

edu.udo.cs.wvtool.generic.inputfilter.PDFInputFilter

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

eu.planets_project.services.migration.pdfbox.TextExtractor

it.unimi.dsi.mg4j.document.PdfDocumentFactory

net.fp.rp.search.back.extractor.PdfDataExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.