Package it.unimi.dsi.mg4j.document

Source Code of it.unimi.dsi.mg4j.document.PdfDocumentFactory

package it.unimi.dsi.mg4j.document;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2010 Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.ObjectInputStream;
import java.io.PipedReader;
import java.io.PipedWriter;

import org.apache.commons.configuration.ConfigurationException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;


/** A factory that converts PDF (Portable Document Format) documents into text.
* Presently this class is very inefficient; it is mainly useful for debugging
* and exemplification purposes.
*/

public class PdfDocumentFactory extends PropertyBasedDocumentFactory {
  private static final long serialVersionUID = 1L;

  /** Case-insensitive keys for metadata.
   *
   *  @see PropertyBasedDocumentFactory.MetadataKeys
   */
  public static enum MetadataKeys {
    /** A property specifying that the factory should use the first line of text as a title (not implemented). */
    PARSETITLE,
  }

  /** A PDF text stripper that will be used to extract text from PDF documents. */
  private transient PDFTextStripper textStripper;
  /** The word reader used for all documents. */
  private final WordReader wordReader;

  protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {
    if ( sameKey( MetadataKeys.PARSETITLE, key ) ) {
      /*metadata.put( PARSE_TITLE, value );
      return true;*/
      throw new ConfigurationException( "PARSETITLE is not yet implemented" );
    }
   
    return super.parseProperty( key, values, metadata );
  }

  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
 
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

  public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException {
    super( property );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }

  public PdfDocumentFactory copy() {
    try {
      return new PdfDocumentFactory( defaultMetadata );
    }
    catch ( IOException e ) {
      throw new RuntimeException( e );
    }
  }
 
  public int numberOfFields() {
    return 1;
  }
 
  public String fieldName( final int field ) {
    ensureFieldIndex( field );
    return "text";
  }
 
  public int fieldIndex( final String fieldName ) {
    return "text".equals( fieldName ) ? 0: -1;
  }
 
  public FieldType fieldType( final int field ) {
    ensureFieldIndex( field );
    return FieldType.TEXT;
  }

  private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    textStripper = new PDFTextStripper();
  }
 
  public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {
    return new AbstractDocument() {
     
      private PDDocument pdfDocument;
      private Thread pipingThread;
      private PipedReader pipedReader;
      private PipedWriter pipedWriter;
     
      public CharSequence title() {
        return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata );
      }
     
      public String toString() {
        return title().toString();
      }

      public CharSequence uri() {
        return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );
      }

      public Object content( final int field ) throws IOException {
        ensureFieldIndex( field );
        pipedReader = new PipedReader();
        pipedWriter = new PipedWriter();
        pdfDocument = PDDocument.load( rawContent );
        pipedWriter.connect( pipedReader );
        pipingThread = new Thread() {
          public void run() {
            try {
              textStripper.writeText( pdfDocument, pipedWriter );
              pipedWriter.close();
              pipedWriter = null;
            }
            catch( InterruptedIOException dontCare ) {}
            catch ( IOException e ) {
              throw new RuntimeException( e );
            }
          }
        };
        pipingThread.start();
        return pipedReader;
      }
     
      public WordReader wordReader( int field ) {
        ensureFieldIndex( field );
        // TODO: should depend on locale or something.
        return wordReader;
      }

      public void close() throws IOException {
        super.close();
        if ( pipingThread != null ) {
          try {
            pipingThread.interrupt();
            pipingThread.join();
            pipingThread = null;
          }
          catch ( InterruptedException e ) {
            throw new RuntimeException( e );
          }
        }

        if ( pipedReader != null ) {
          pipedReader.close();
          pipedReader = null;
        }
        if ( pipedWriter != null ) {
          pipedWriter.close();
          pipedWriter = null;
        }
        if ( pdfDocument != null ) {
          pdfDocument.close();
          pdfDocument = null;
        }
      }
    };
  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.document.PdfDocumentFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.