Package org.pdfbox.pdmodel

Examples of org.pdfbox.pdmodel.PDDocument


  private Configuration conf;

  public ParseResult getParse(Content content) {

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here


   * @return Reader a reader that is fed to an indexer.
   */
  protected Reader getReader(InputStream docStream)
  {
   
    PDFParser parser = null; PDDocument document = null; PDFTextStripper stripper = null;
    CharArrayWriter writer = null;
    try{
      parser = new PDFParser(docStream);
      parser.parse();
      document = parser.getPDDocument();
      writer = new CharArrayWriter();
      stripper = new PDFTextStripper();
      stripper.setLineSeparator("\n");
      stripper.writeText(document, writer);
      document.close();
      writer.close();
      parser.getDocument().close();
      return new CharArrayReader(writer.toCharArray());
    }catch (Exception e){
        //logger.warn("WARNING: Problem converting PDF: ",e);
      try{
        document.close();       
      }catch(Exception e1){
        //logger.warn("WARNING: Problem converting PDF: ",e1);
      }
      try{
        writer.close();
View Full Code Here

  private Configuration conf;

  public Parse getParse(Content content) {

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf");
      // fout.write(raw);
      // fout.close();
      // TODO MC

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here

   {
      if (is == null)
      {
         throw new NullPointerException("InputStream is null.");
      }
      PDDocument pdDocument = null;
      StringWriter sw = new StringWriter();
      try
      {
         try
         {
            pdDocument = PDDocument.load(is);
         }
         catch (IOException e)
         {
            return "";
         }

         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
            {
               pdDocument.close();
            }
            catch (IOException e)
            {
            }
         if (is != null)
View Full Code Here

               
            try {
                PDFParser parser = new PDFParser(blob.getStream());
                parser.parse();
   
                PDDocument document = parser.getPDDocument();
   
                CharArrayWriter writer = new CharArrayWriter();
   
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);
   
                document.close();
                writer.close();
               
                Map result = new HashMap();
                result.put(FieldNames.FULLTEXT, new CharArrayReader(writer.toCharArray()));
                return result;
View Full Code Here

  private Configuration conf;

  public Parse getParse(Content content) {

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here

      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();

      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();

      return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
      String msg = "Failed to write to the index";
View Full Code Here

            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", docText, Field.Store.NO, Field.Index.TOKENIZED));
View Full Code Here

        if (info.getLevel() >= 0) {
            InputStream in = UtilExtract.getStream(info.getUri());

            ByteArrayOutputStream bout = null;
            Writer writer = null;
            PDDocument document = null;

           
            try {

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");


             
              //load the document
                document = PDDocument.load(in);

                String author = "";
                String title = "";
                String summary = "";

                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();

                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }

                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }

                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
                }

                //set the buffer
                bout = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(bout);

                //strip the document to the buffer
                stripper.writeText(document, writer);
                bout.flush();
                writer.flush();

                //construct the patterns (to not ignore and replace)
                Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
                Pattern replacePattern = Pattern.compile(getReplaceChars());

                NodeStruct node = new NodeStruct();
                ByteArrayInputStream bin = null;

                try {
                    bin = new ByteArrayInputStream(bout.toByteArray());

                    byte[] buffer = new byte[1024];
                    int n = bin.read(buffer);

                    while (n > 0) {
                        String chars = new String(buffer, 0, n);

                        //generate the list of the words for the buffer
                        LinkedList listWords = UtilExtract.getValueList(chars,
                                getMinLengthWord(), notIgnorePattern,
                                replacePattern);

                        for (int j = 0; j < listWords.size(); j++)
                            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                                (String) listWords.get(j));

                        n = bin.read(buffer);
                    }

                    logger.debug("Title is " + title + "Path is :" +
                        info.getUri() + "author" + author + " Summary:" +
                        summary);

                    //set the summary field according to the default settings
                    if (summary.length() > getMaxLengthSummary()) {
                        summary = summary.substring(0, getMaxLengthSummary());
                    }

                    DocumStruct doc = new DocumStruct();
                    doc.setTitle(title);
                    doc.setPath(info.getUri());
                    doc.setDescription(summary);
                    doc.setContent(node);
                    doc.setCategoryName(info.getCategoryName());
                    doc.setCategoryLocation(info.getCategoryLocation());

                    //set the pdf -author
                    doc.setAuthor(author);

                    //store and reindex document
                    PluginManager.storeAndAddDocument(doc);
                } catch (IOException e) {
                    logger.debug("Exception in reading the document text" +
                        e.getMessage(), e);
                    throw new RpException("extractor.pdf.textdatamissing",
                        new Object[] { info.getUri() });
                } finally {
                    try {
                        if (bin != null) {
                            bin.close();
                        }
                    } catch (Exception e) {
                    }
                }
            } catch (IOException e) {
                logger.debug("Exception in reading the document text" +
                    e.getMessage(), e);
                throw new RpException("app.extract.error",
                    new Object[] { info.getUri() });
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }

                    if (bout != null) {
                        bout.close();
                    }

                    if (document != null) {
                        document.close();
                    }
                } catch (Exception e) {
                }
            }
        } else {
View Full Code Here

            log.error("parse() filename is null");
            throw new ConverterException("PDFConverter::parse() filename is null");
        }

        // PD Document
        PDDocument document = null;
        Writer output = null;
        try {
            document = getPDDocument();

            // check document is readable
            AccessPermission ap = document.getCurrentAccessPermission();
            if (! ap.canExtractContent()) {
                log.info("parse() Document (" + filename + ") isn't readable for DocSearcher.");
                throw new ConverterException("parse() can't read PDF file");
            }

            // extract the text into an in-memory buffer
            try {
                log.debug("parse() Attempting to extract text from (" + filename + ")");

                output = new StringWriter();

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(document, output);

                log.debug("parse() Successfully stripped out text from (" + filename + ")");
            }
            catch (IOException ioe) {
                log.error("parse() failed", ioe);
                throw new ConverterException("PDFConverter::parse() failed", ioe);
            }

            // get the meta data
            PDDocumentInformation info = document.getDocumentInformation();
            documentTitle = info.getTitle();
            documentAuthor = info.getAuthor();
            documentKeywords = info.getKeywords();
            if (document != null) {
                documentText = output.toString();
            }
        }
        catch (IOException ioe) {
            log.error("parse() failed", ioe);
            throw new ConverterException("parse() failed", ioe);
        }
        finally {
            // close stream
            IOUtils.closeQuietly(output);

            // close document
            try {
                if (document != null) {
                    document.close();
                }
            }
            catch (IOException ioe) {
                log.fatal("parse() can't close PDDocument", ioe);
            }
View Full Code Here

TOP

Related Classes of org.pdfbox.pdmodel.PDDocument

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.