Examples of org.pdfbox.util.PDFTextStripper

org.pdfbox.util.PDFTextStripper
This class takes a PDF document and strips out all of its text, ignoring the formatting and similar details. @author Ben Litchfield @version $Revision: 1.69 $

    try {
      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();


      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();


      return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
      String msg = "Failed to write to the index";

View Full Code Here


            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();


            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", docText, Field.Store.NO, Field.Index.TOKENIZED));

View Full Code Here

            PDDocument document = null;


            
            try {


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");




              
              //load the document
                document = PDDocument.load(in);


                String author = "";
                String title = "";
                String summary = "";


                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();


                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }


                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }


                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
                }


                //set the buffer
                bout = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(bout);


                //strip the document to the buffer 
                stripper.writeText(document, writer);
                bout.flush();
                writer.flush();


                //construct the patterns (to not ignore and replace)
                Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());

View Full Code Here

            try {
                log.debug("parse() Attempting to extract text from (" + filename + ")");


                output = new StringWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(document, output);


                log.debug("parse() Successfully stripped out text from (" + filename + ")");
            }
            catch (IOException ioe) {
                log.error("parse() failed", ioe);

View Full Code Here

    public Reader convertToPlainText(InputStream source, WVTDocumentInfo d) {


        String plainText = null;
        try {
            PDDocument document = PDDocument.load(source);
            PDFTextStripper stripper = new PDFTextStripper();
            plainText = stripper.getText(document);
            document.close();
        } catch (IOException e) {
            WVToolLogger.getGlobalLogger().logException("Could not read or convert PDF Document", e);
            plainText = new String();
        }

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here

        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.

View Full Code Here


            PDDocument document = parser.getPDDocument();


            CharArrayWriter writer = new CharArrayWriter();


            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);


            document.close();
            writer.close();


            return new CharArrayReader(writer.toCharArray());

View Full Code Here

    return getTextInternal(startPage, endPage);
  }


  protected String getTextInternal(int startPage, int endPage) {
    try {
      final PDFTextStripper textStripper = new PDFTextStripper();
      textStripper.setStartPage(startPage);
      textStripper.setEndPage(endPage);
      return textStripper.getText(getPDFDocument());
    } catch (final IOException e) {
      throw new RuntimeException("Problem extracting text", e);
    }
  }

View Full Code Here

        final String lineSeparator, final String pageSeparator)
    {
        final StringWriter output = new StringWriter();
        try 
        {
            final PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setPageSeparator(pageSeparator);
            textStripper.setLineSeparator(lineSeparator);
            textStripper.setStartPage(startPage);
            textStripper.setEndPage(endPage);
            textStripper.writeText(getPDFDocument(), output);
            return output.toString();
        }
        catch (final Exception e) 
        {
            throw new RuntimeException("Error while extracting text from document.", e);

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.pdfbox.util.PDFTextStripper

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

edu.udo.cs.wvtool.generic.inputfilter.PDFInputFilter

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

eu.planets_project.services.migration.pdfbox.TextExtractor

it.unimi.dsi.mg4j.document.PdfDocumentFactory

net.fp.rp.search.back.extractor.PdfDataExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.