Package org.apache.nutch.parse.pdf

Source Code of org.apache.nutch.parse.pdf.PdfParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse.pdf;

import org.pdfbox.encryption.DocumentEncryption;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.util.LogUtil;

import java.text.SimpleDateFormat;
import java.util.Calendar;

import java.io.ByteArrayInputStream;
import java.io.IOException;


// TODO MC
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileOutputStream;
// TODO MC


/*********************************************
* parser for mime type application/pdf.
* It is based on org.pdfbox.*. We have to see how well it does the job.
*
* @author John Xing
*
* Note on 20040614 by Xing:
* Some codes are stacked here for convenience (see inline comments).
* They may be moved to more appropriate places when new codebase
* stabilizes, especially after code for indexing is written.
*
*********************************************/

public class PdfParser implements Parser {
  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.pdf");
  private Configuration conf;

  public Parse getParse(Content content) {

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf");
      // fout.write(raw);
      // fout.close();
      // TODO MC

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

    if (text == null)
      text = "";

    if (title == null)
      title = "";

    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }

  // format date
  // currently not used. please keep it for future use.
  private String formatDate(Calendar date) {
    String retval = null;
    if(date != null) {
      SimpleDateFormat formatter = new SimpleDateFormat();
      retval = formatter.format(date.getTime());
    }
    return retval;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }


// TODO MC - a partir daqui para testes
  /**
   * Main for testing
   */
  public static void main(String args[]) { 
  String file = args[0];
  System.out.println("File="+file);

        try {
            Configuration conf = NutchConfiguration.create();

      byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "application/pdf", meta, conf);

      //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);
      //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();
      //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);

      PdfParser parser=new PdfParser();
      System.out.println("TEXT:\n"+parser.getParse(content).getText());               
       //System.out.println("TEXT:\n"+parse.getText());
       System.out.println("METADATA:\n"+meta);
  }
  catch(Exception e) {
      e.printStackTrace();
  }
  }

  private final static byte[] getRawBytes(File f) {
  try {
      if (!f.exists())
    return null;
      FileInputStream fin = new FileInputStream(f);
      byte[] buffer = new byte[(int) f.length()];
      fin.read(buffer);
      fin.close();
      return buffer;
  } catch (Exception err) {
      err.printStackTrace(LogUtil.getErrorStream(LOG));
      return null;
  }
  }

}
TOP

Related Classes of org.apache.nutch.parse.pdf.PdfParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.