Package gov.lanl.adore.djatoka.plugin

Source Code of gov.lanl.adore.djatoka.plugin.ExtractPDF

/*
* Copyright (c) 2010 Brasiliana Digital Library (http://brasiliana.usp.br).
* Based on similar source code from Adore Djatoka.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

package gov.lanl.adore.djatoka.plugin;

import gov.lanl.adore.djatoka.DjatokaDecodeParam;
import gov.lanl.adore.djatoka.DjatokaException;
import gov.lanl.adore.djatoka.IExtract;
import gov.lanl.adore.djatoka.openurl.OpenURLJP2KService;
import gov.lanl.adore.djatoka.util.IOUtils;
import gov.lanl.adore.djatoka.util.ImageProcessingUtils;
import gov.lanl.adore.djatoka.util.ImageRecord;

import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.awt.Dimension;
import java.awt.Rectangle;

import javax.imageio.ImageIO;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.HashMap;
import java.util.Properties;

import java.util.Enumeration;
import java.util.StringTokenizer;
import java.util.logging.Level;

import org.apache.log4j.Logger;

import gov.lanl.util.ConfigurationManager;


/**
* Uses Poppler PDF commands to extract PDF pages to PNG files.
* @author Fabio N. Kepler
*/
public class ExtractPDF implements IExtract {

    // Log4j logger used by every method of this plugin.
    private static Logger logger = Logger.getLogger(ExtractPDF.class);

    // Maximum size, in pixels, of either preview image dimension (used by getScaledDPI).
    private static final int MAX_PX = 1000;

    // Maximum DPI: pages are rendered at this resolution when no level reduction applies.
    private static final int MAX_DPI = 180;

    // Base density used as the divisor when scaling page sizes up to MAX_DPI
    // (presumably PDF points per inch, 1/72" -- confirm).
    private static int DEFAULT_DENSITY = 72;
    //private static String DEFAULT_COLORSPACE = "RGB";
    // Number of resolution levels reported for every PDF (set as both DWT levels and levels).
    private static int DEFAULT_LEVELS = 4;

    // Command to render PDF pages to PNG (pdftoppm); @COMMAND@, @FILE@, @OUTPUTFILE@ etc. are
    // placeholders that are overwritten by index using the POSITION constants below.
    private static final String PDFTOPPM_COMMAND[] =
    {
        "@COMMAND@", "-q", "-png", "-f", "@FIRSTPAGE@", "-l", "@LASTPAGE@",
        "-r", "@DPI@", "@FILE@", "@OUTPUTFILE@"
    };
    // Indexes into PDFTOPPM_COMMAND; keep in sync with the array layout above.
    private static final int PDFTOPPM_COMMAND_POSITION_BIN = 0;
    private static final int PDFTOPPM_COMMAND_POSITION_FIRSTPAGE = 4;
    private static final int PDFTOPPM_COMMAND_POSITION_LASTPAGE = 6;
    private static final int PDFTOPPM_COMMAND_POSITION_DPI = 8;
    private static final int PDFTOPPM_COMMAND_POSITION_OPTIONAL_EXTRAS = 9; // Must insert at this position instead of just setting it.
    private static final int PDFTOPPM_COMMAND_POSITION_FILE = 9;
    private static final int PDFTOPPM_COMMAND_POSITION_OUTPUTFILE = 10;
   

    // Command to query page count and page boxes of a PDF (pdfinfo with -box);
    // @COMMAND@, @FILE@ etc. are placeholders overwritten by index using the constants below.
    private static final String PDFINFO_COMMAND[] =
    {
        "@COMMAND@", "-f", "@FIRSTPAGE@", "-l", "@LASTPAGE@", "-box", "@FILE@"
    };
    // Indexes into PDFINFO_COMMAND; keep in sync with the array layout above.
    private static final int PDFINFO_COMMAND_POSITION_BIN = 0;
    private static final int PDFINFO_COMMAND_POSITION_FIRSTPAGE = 2;
    private static final int PDFINFO_COMMAND_POSITION_LASTPAGE = 4;
    private static final int PDFINFO_COMMAND_POSITION_FILE = 6;


    // Executable path for "pdftoppm"; resolved lazily by setPDFCommandsPath() through ConfigurationManager.
    private static String pdftoppmPath = null;

    // Executable path for "pdfinfo"; resolved lazily by setPDFCommandsPath() through ConfigurationManager.
    private static String pdfinfoPath = null;

    // Matches a "Page N MediaBox: x0 y0 x1 y1" line in pdfinfo output.
    // Groups: 1 = page number, 2..5 = the four box coordinates.
    private static final Pattern MEDIABOX_PATT = Pattern.compile(
        //"^Page\\s+(\\d+)\\s+size:\\s+([\\.\\d]+)\\s+x\\s+([\\.\\d]+)\\s+pts\\.*$"); // Does not seem to match "Page   41 size: 595 x 842 pts (A4)".
        "^Page\\s+(\\d+)\\s+MediaBox:\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)"); // For use with -box switch
    /* Without the -box switch:
        Page    1 size: 444.72 x 771.12 pts
        Page    2 size: 416.16 x 743.52 pts
    */
    /* With -box switch:
        Page    1 size: 444.72 x 771.12 pts
        Page    2 size: 416.16 x 743.52 pts
        Page    1 MediaBox:     0.00     0.00   444.72   771.12
        Page    1 CropBox:      0.00     0.00   444.72   771.12
        Page    1 BleedBox:     0.00     0.00   444.72   771.12
        Page    1 TrimBox:      0.00     0.00   444.72   771.12
        Page    1 ArtBox:       0.00     0.00   444.72   771.12
        Page    2 MediaBox:     0.00     0.00   416.16   743.52
        Page    2 CropBox:      0.00     0.00   416.16   743.52
        Page    2 BleedBox:     0.00     0.00   416.16   743.52
        Page    2 TrimBox:      0.00     0.00   416.16   743.52
        Page    2 ArtBox:       0.00     0.00   416.16   743.52
    */

    // Matches the "Pages: N" line in pdfinfo output; group 1 is the page count.
    private static final Pattern PAGES_PATT = Pattern.compile(
        "^Pages:\\s+([\\d-]+)");


    // NOTE(review): not read or written by any live code in this class (only a commented-out line mentions it).
    private static Properties props = new Properties();

    // Fallback executable locations passed as defaults to ConfigurationManager.getProperty.
    private static final String DEFAULT_PDFTOPPM_PATH = "/usr/bin/pdftoppm";
    private static final String DEFAULT_PDFINFO_PATH = "/usr/bin/pdfinfo";

    // Configuration keys under which the executable paths are looked up.
    private static final String PROPS_PDF_PDFTOPPM_PATH = "PDF.pdftoppmPath";
    private static final String PROPS_PDF_PDFINFO_PATH = "PDF.pdfinfoPath";


  /**
   * Returns PDF props in ImageRecord
   * @param r ImageRecord containing absolute file path of PDF file.
   * @return a populated ImageRecord object
   * @throws DjatokaException
   */
    @Override
  public final ImageRecord getMetadata(ImageRecord r) throws DjatokaException {
    if ((r.getImageFile() == null || !new File(r.getImageFile()).exists()) && r.getObject() == null)
      throw new DjatokaException("Image Does Not Exist: " + r.toString());
    logger.debug("Get metadata: " + r.toString());
        try {
            DjatokaDecodeParam params = new DjatokaDecodeParam();
            BufferedImage bi = process(r, params);

      r.setWidth(bi.getWidth());
      r.setHeight(bi.getHeight());
            r.setDWTLevels(DEFAULT_LEVELS);
            r.setLevels(DEFAULT_LEVELS);
            r.setBitDepth(bi.getColorModel().getPixelSize());
            r.setNumChannels(bi.getColorModel().getNumColorComponents());
           
            //r.setCompositingLayerCount(getNumberOfPages(r)); // Semantics: number of pages in the PDF file.
            HashMap<String, String> pdfProps = (HashMap<String, String>)getPDFProperties(r);
            int n = Integer.parseInt(pdfProps.remove("Pages"));
            r.setCompositingLayerCount(n);
           
            // Since it is not possible for the viewer to query about a specific page's width and height
            // (because in Djatoka's point of view a PDF is just one image with various compositing layers, which are the pages),
            // at this point right here we query the PDF file about the size of all pages and store this
            // information in a Map. This map can be returned by getMetadata by setting it as the instProps member of the
            // ImageRecord class, which Djatoka already implements and which is returned as JSON to the viewer JS.
            // The viewer then has to store this information and later query it instead of asking Djatoka (getMetadata) again.
            //Map<String, String> instProps = getPagesSizes(r);
            r.setInstProps(pdfProps);
            logger.debug("instProps: " + r.getInstProps());

            logger.debug("Get metadata: "+r.toString());
    } catch (Exception e) {
      throw new DjatokaException(e);
    }

    return r;
  }


//*
    public final ImageRecord getMetadata(BufferedImage bi) throws DjatokaException {
    if (bi == null)
      throw new DjatokaException("Image Does Not Exist");

        logger.debug("getMetadata(BufferedImage): " + bi.getWidth());
        try {
            ImageRecord r = new ImageRecord();

      r.setWidth(bi.getWidth());
      r.setHeight(bi.getHeight());

            r.setDWTLevels(DEFAULT_LEVELS);
            r.setLevels(DEFAULT_LEVELS);

            r.setBitDepth(bi.getColorModel().getPixelSize());
            r.setNumChannels(bi.getColorModel().getNumColorComponents());
            //r.setCompositingLayerCount(getNumberOfPages(r)); // 'bi' refers to just one page extracted from the PDF file.
            //logger.debug("r2: "+r.toString());
           
            //TODO
           
            return r;
    } catch (Exception e) {
      throw new DjatokaException(e);
    }
  }
//*/

    @Override
    // TODO
    // FIXME
  public final String[] getXMLBox(ImageRecord r) throws DjatokaException {
    String[] xml = null;
    try {
      if (r.getImageFile() == null && r.getObject() != null
          && r.getObject() instanceof InputStream) {
//        xml = new JP2ImageInfo((InputStream) r.getObject()).getXmlDocs();
      } else {
//        xml = new JP2ImageInfo(new File(r.getImageFile())).getXmlDocs();
      }
    } catch (Exception e) {
      logger.error(e, e);
    }
    return xml;
  }

 
  /**
   * Extracts region defined in DjatokaDecodeParam as BufferedImage
   * @param input absolute file path of PDF file.
   * @param params DjatokaDecodeParam instance containing region and transform settings.
   * @return extracted region as a BufferedImage
   * @throws DjatokaException
   */
    @Override
  public BufferedImage process(String input, DjatokaDecodeParam params)
      throws DjatokaException {
     
    logger.debug("ExtractPDF.process:\n\tinput: " + input + "\n\tparams: " + params);
   
        if (input == null)
            throw new DjatokaException("Unknown failure while converting file: no image produced.");
           
    try {
        setPDFCommandsPath();
      } catch (IllegalStateException e) {
            logger.error("Failed to set PDF commands path: ",e);
            throw e;
        }
     
        int page_number = 1 + params.getCompositingLayer(); // From 0-based to 1-based.
        int status = 0;
        BufferedImage processedImage = null;
        try
        {
            /*
            // First get max physical dim of bounding box of the page
            // to compute the DPI to ask for..  otherwise some AutoCAD
            // drawings can produce enormous files even at 75dpi, for
            // 48" drawings..
            int dpi = 0;
            Dimension pageSize = getPDFPageSize(input, page_number);
            if (pageSize == null)
            {
                logger.error("Sanity check: Did not find \"Page " + page_number + " size\" line in output of pdfinfo, file="+input);
                throw new IllegalArgumentException("Failed to get \"Page " + page_number + " size\" of PDF with pdfinfo.");
            }
            else
            {
                double w = pageSize.getWidth();
                double h = pageSize.getHeight();
                int maxdim = (int)Math.max(Math.abs(w), Math.abs(h));
                dpi = Math.min(MAX_DPI, (MAX_PX * 72 / maxdim));
                logger.debug("DPI: pdfinfo method got dpi="+dpi+" for max dim="+maxdim+" (points, 1/72\")");
            } */

            // Scale
            int dpi = getScaledDPI(params);

            // Requires Sun JAI imageio additions to read ppm directly.
            // this will get "-[0]+1.ppm" appended to it by pdftoppm
            File outPrefixF = File.createTempFile("pdftopng", "out");
            String outPrefix = outPrefixF.toString();
            outPrefixF.delete();

            //String pdfCmd[] = PDFTOPPM_COMMAND.clone();
            ArrayList<String> pdfCmd = new ArrayList<String>(Arrays.asList(PDFTOPPM_COMMAND));
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_BIN, pdftoppmPath);
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_FIRSTPAGE, "" + page_number);
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_LASTPAGE, "" + page_number);
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_DPI, String.valueOf(dpi));
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_FILE, input.toString());
            pdfCmd.set(PDFTOPPM_COMMAND_POSITION_OUTPUTFILE, outPrefix);

            // Crop
            Rectangle crop = getCropParam(params);
            if (crop != null) {
                String[] cropParams = {"-x", ""+(int)crop.getX(), "-y", ""+(int)crop.getY(), "-W", ""+(int)crop.getWidth(), "-H", ""+(int)crop.getHeight()};
                pdfCmd.addAll(PDFTOPPM_COMMAND_POSITION_OPTIONAL_EXTRAS, Arrays.asList(cropParams));
            }

            String[] pdfCmdA = pdfCmd.toArray(new String[pdfCmd.size()]);
            logger.debug("Running pdftoppm command: " + Arrays.deepToString(pdfCmdA));
            //logger.debug("Running pdftoppm command: " + pdfCmd.toString());

            File outf = null;
            Process pdfProc = null;
            try
            {
                pdfProc = Runtime.getRuntime().exec(pdfCmdA);
                status = pdfProc.waitFor();
                logger.debug("status: " + status);
               
                // pdftoppm uses variable numbers of padding 0s to the output prefix.
                // E.g., may be prefix-000001.png, prefix-001.png or even prefix-01.png.
                // Version 0.12.3 (Poppler, not XPDF) seems to consider the total number of pages.
                // So, for example, in a PDF with 90 pages, the output will be "prefix-02.png";
                // for a PDF with 350 pages, the output will be "prefix-002.png".
                // FIXME: try some approach where the PDF number of pages is considered without
                // running pdfinfo command again, thus making it simpler to determine the number
                // of padding zeros. Right now we going "brute force" because we do not know if
                // it is feasable to once again run the pdfinfo command.
                String tests[] = {
                    outPrefix + "-" + page_number + ".png",
                    outPrefix + "-0" + page_number + ".png",
                    outPrefix + "-00" + page_number + ".png",
                    outPrefix + "-000" + page_number + ".png",
                    outPrefix + "-0000" + page_number + ".png",
                    outPrefix + "-00000" + page_number + ".png"
                    };
                for (String outname : tests)
                {
                    if ((new File(outname)).exists())
                    {
                        outf = new File(outname);
                        break;
                    }
                }
                logger.debug("PDFTOPPM output is: "+outf+", exists=" + outf != null ? outf.exists() : "!");
                processedImage = ImageIO.read(outf);
               
                // Rotate
                if (params.getRotationDegree() > 0) {
                    processedImage = ImageProcessingUtils.rotate(processedImage, params.getRotationDegree());
                }
            }
            catch (InterruptedException e)
            {
                logger.error("Failed converting PDF file to image: ", e);
                throw new IllegalArgumentException("Failed converting PDF file to image: ", e);
            }
            finally
            {
                if (outf != null) outf.delete();
                // Our exec() should not produce any output, but we want to stay safe.
                // http://mark.koli.ch/2011/01/leaky-pipes-remember-to-close-your-streams-when-using-javas-runtimegetruntimeexec.html
                org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getOutputStream());
                org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getInputStream());
                org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getErrorStream());
            }
        }
        catch (Exception e)
        {
                logger.error("Failed converting PDF file to image: ", e);
                throw new IllegalArgumentException("Failed converting PDF file to image: ", e);
        }
        finally
        {
            if (status != 0)
                logger.error("PDF conversion proc failed, exit status="+status+", file="+input);
        }

        return processedImage;
  }


    public BufferedImage processUsingTemp(InputStream input, DjatokaDecodeParam params)
            throws DjatokaException {
        File in;
        // Copy to tmp file
        try {
            String cacheDir = OpenURLJP2KService.getCacheDir();
            if (cacheDir != null) {
                in = File.createTempFile("tmp", ".pdf", new File(cacheDir));
            } else {
                in = File.createTempFile("tmp", ".pdf");
            }
            FileOutputStream fos = new FileOutputStream(in);
            in.deleteOnExit();
            IOUtils.copyStream(input, fos);
        } catch (IOException e) {
            logger.error(e, e);
            throw new DjatokaException(e);
        }

        BufferedImage bi = process(in.getAbsolutePath(), params);

        if (in != null) {
            in.delete();
        }

        return bi;
    }


  /**
   * Extracts region defined in DjatokaDecodeParam as BufferedImage
   * @param input InputStream containing a PDF bitstream.
   * @param params DjatokaDecodeParam instance containing region and transform settings.
   * @return extracted region as a BufferedImage
   * @throws DjatokaException
   */
    @Override
  public BufferedImage process(InputStream input, DjatokaDecodeParam params)
            throws DjatokaException {
        return processUsingTemp(input, params);
    }

 
  /**
   * Extracts region defined in DjatokaDecodeParam as BufferedImage
   * @param input ImageRecord wrapper containing file reference, inputstream, etc.
   * @param params DjatokaDecodeParam instance containing region and transform settings.
   * @return extracted region as a BufferedImage
   * @throws DjatokaException
   */
    @Override
  public BufferedImage process(ImageRecord input, DjatokaDecodeParam params)
      throws DjatokaException {
        logger.debug("in imagerecord;");
    if (input.getImageFile() != null)
      return process(input.getImageFile(), params);
    else if (input.getObject() != null
        && (input.getObject() instanceof InputStream))
      return process((InputStream) input.getObject(), params);
    else
      throw new DjatokaException(
          "File not defined and Input Object Type "
              + input.getObject().getClass().getName()
              + " is not supported");
  }


    /**
     * Get PDF information with pdfinfo:
     * - "Pages: X": number of pages;
     * - "Page X size: www.ww hhh.hh": size of each page, in pts.
     * @returns a map:
     * - [Pages][n]
     * - [Page 1][111.11 222.22]
     * - [Page i][www.ww hhh.hh]
     * - [Page n][999.99 1000.00]
     */
    private static Map<String, String> getPDFProperties(ImageRecord input) throws DjatokaException {
        logger.debug("Getting PDF info");

    try {
        setPDFCommandsPath();
      } catch (IllegalStateException e) {
            logger.error("Failed to set PDF commands path: ",e);
            throw e;
        }
       
        HashMap<String, String> pdfProperties = new HashMap<String, String>();
       
        String sourcePath = null;

        if (input.getImageFile() != null) {
            logger.debug("PDFInfo image file: " + input.getImageFile());
            sourcePath = input.getImageFile();
        } else if (input.getObject() != null && (input.getObject() instanceof InputStream)) {
            FileInputStream fis = null;
            fis = (FileInputStream) input.getObject();
            File in;
           
            // Copy to tmp file
            try {
                String cacheDir = OpenURLJP2KService.getCacheDir();
                if (cacheDir != null) {
                    in = File.createTempFile("tmp", ".pdf", new File(cacheDir));
                } else {
                    in = File.createTempFile("tmp", ".pdf");
                }
                in.deleteOnExit();

                FileOutputStream fos = new FileOutputStream(in);
                IOUtils.copyStream(fis, fos);
            } catch (IOException e) {
                logger.error(e, e);
                throw new DjatokaException(e);
            }
            sourcePath = in.getAbsolutePath();
        } else {
            throw new DjatokaException(
                    "File not defined and Input Object Type "
                    + input //.getObject().getClass().getName()
                    + " is not supported");
        }
       
        String pdfinfoCmd[] = PDFINFO_COMMAND.clone();
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_BIN] = pdfinfoPath;
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_FIRSTPAGE] = "1";
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_LASTPAGE] = "-1"; // Last page even we not knowing its number.
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_FILE] = sourcePath;
        Process pdfProc = null;
        try
        {
            ArrayList<MatchResult> pageSizes = new ArrayList<MatchResult>();
            MatchResult pages = null;
           
            pdfProc = Runtime.getRuntime().exec(pdfinfoCmd);
            BufferedReader lr = new BufferedReader(new InputStreamReader(pdfProc.getInputStream()));
            String line;
            for (line = lr.readLine(); line != null; line = lr.readLine())
            {
                Matcher mm1 = PAGES_PATT.matcher(line);
                if (mm1.matches())
                    pages = mm1.toMatchResult();
                Matcher mm2 = MEDIABOX_PATT.matcher(line);
                if (mm2.matches())
                    pageSizes.add(mm2.toMatchResult());
            }

            int istatus = pdfProc.waitFor();
            if (istatus != 0)
                logger.error("pdfinfo proc failed, exit status=" + istatus + ", file=" + sourcePath);
               
            if (pages == null)
            {
                logger.error("Did not find 'Pages' line in output of pdfinfo command: " + Arrays.deepToString(pdfinfoCmd));
                pdfProperties.put("Pages", "0");
            }
            else
            {
                //int n = Integer.parseInteger(pages.group(1));
                pdfProperties.put("Pages", pages.group(1));
            }
           
            if (pageSizes.isEmpty())
            {
                logger.error("Did not find \"Page X size\" lines in output of pdfinfo command: " + Arrays.deepToString(pdfinfoCmd));
                throw new IllegalArgumentException("Failed to get pages size of PDF with pdfinfo.");
            }
            else
            {
                for (MatchResult mr : pageSizes)
                {
                    String page = mr.group(1);
                   
                    float x0 = Float.parseFloat(mr.group(2));
                    float y0 = Float.parseFloat(mr.group(3));
                    float x1 = Float.parseFloat(mr.group(4));
                    float y1 = Float.parseFloat(mr.group(5));
                    float w = Math.abs(x1 - x0);
                    float h = Math.abs(y1 - y0);
                    // Have to scale page sizes by max dpi (MAX_DPI / DEFAULT_DENSITY). Otherwise, BookReader.js will request the wrong zoom level (svc.level).
                    float ws = w * MAX_DPI / DEFAULT_DENSITY;
                    float hs = h * MAX_DPI / DEFAULT_DENSITY;
                    String width = "" + ws; //mr.group(2);
                    String height = "" + hs; //mr.group(3);
                    pdfProperties.put("Page " + page, width + " " + height);
                }
            }
           
        } catch (Exception e) {
            logger.error("Failed getting PDF information: ", e);
            throw new DjatokaException("Failed getting PDF information: ", e);
        } finally {
            // Our exec() should just consume one of the streams, but we want to stay safe.
            // http://mark.koli.ch/2011/01/leaky-pipes-remember-to-close-your-streams-when-using-javas-runtimegetruntimeexec.html
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getOutputStream());
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getInputStream());
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getErrorStream());
        }
       
        return pdfProperties;
    }

/*
    private static Dimension getPDFPageSize(String source, int page_number) throws DjatokaException {
        logger.debug("Getting PDF info for size of page '" + page_number + "'.");
       
        Dimension pageDimension = null;

    try {
        setPDFCommandsPath();
      } catch (IllegalStateException e) {
            logger.error("Failed to set PDF commands path: ",e);
            throw e;
        }
       
        String pdfinfoCmd[] = PDFINFO_COMMAND.clone();
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_BIN] = pdfinfoPath;
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_FIRSTPAGE] = "" + page_number;
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_LASTPAGE] = "" + page_number; // Last page even we not knowing its number.
        pdfinfoCmd[PDFINFO_COMMAND_POSITION_FILE] = source;
        Process pdfProc = null;
        try
        {
            MatchResult pageSize = null;;
           
            pdfProc = Runtime.getRuntime().exec(pdfinfoCmd);
            BufferedReader lr = new BufferedReader(new InputStreamReader(pdfProc.getInputStream()));
            String line;
            for (line = lr.readLine(); line != null; line = lr.readLine())
            {
                Matcher mm = MEDIABOX_PATT.matcher(line);
                if (mm.matches())
                    pageSize = mm.toMatchResult();
            }

            int istatus = pdfProc.waitFor();
            if (istatus != 0)
                logger.error("pdfinfo proc failed, exit status=" + istatus + ", file=" + source);
               
            if (pageSize == null)
            {
                logger.error("Did not find 'Page " + page_number + " size' line in output of pdfinfo command: " + pdfinfoCmd);
                //throw new IllegalArgumentException("Failed to get pages size of PDF with pdfinfo.");
                pageDimension = new Dimension(0, 0);
            }
            else
            {
                String page = pageSize.group(1);
                double x0 = Double.parseDouble(pageSize.group(2));
                double y0 = Double.parseDouble(pageSize.group(3));
                double x1 = Double.parseDouble(pageSize.group(4));
                double y1 = Double.parseDouble(pageSize.group(5));
                double width = Math.abs(x1 - x0);
                double height = Math.abs(y1 - y0);
                pageDimension = new Dimension();
                pageDimension.setSize(width, height);
            }
           
        }
        catch (Exception e)
        {
            logger.error("Failed getting PDF page size: ", e);
            throw new DjatokaException("Failed getting PDF page size: ", e);
        } finally {
            // Our exec() should just consume one of the streams, but we want to stay safe.
            // http://mark.koli.ch/2011/01/leaky-pipes-remember-to-close-your-streams-when-using-javas-runtimegetruntimeexec.html
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getOutputStream());
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getInputStream());
            org.apache.commons.io.IOUtils.closeQuietly(pdfProc.getErrorStream());
        }
        return pageDimension;
    }
*/

    private int getScaledDPI(DjatokaDecodeParam params) {
        if (params.getLevel() >= 0) {
            int levels = DEFAULT_LEVELS;
            int reduce = levels - params.getLevel();
            params.setLevelReductionFactor((reduce >= 0) ? reduce : 0);
        } else if (params.getLevel() == -1 && params.getRegion() == null && params.getScalingDimensions() != null) {
            int width = params.getScalingDimensions()[0];
            int height = params.getScalingDimensions()[1];
            int levels = DEFAULT_LEVELS;
            int scale_level = Math.min(MAX_DPI, (MAX_PX * DEFAULT_DENSITY / Math.max(Math.abs(width), Math.abs(height))));
            int reduce = levels - scale_level;
            params.setLevelReductionFactor((reduce >= 0) ? reduce : 0);           
        }
        if (params.getLevelReductionFactor() > 0) {
            int reduce = 1 << params.getLevelReductionFactor(); // => image.size() / 2^r: reduce 0 means image/1, reduce 1 means image/2, etc.
            double s = 1.0 / reduce;
            return (int)(MAX_DPI * s);
        }
        return MAX_DPI;
    }   
   

    private Rectangle getCropParam(DjatokaDecodeParam params) {
        if (params.getRegion() != null) {
            StringTokenizer st = new StringTokenizer(params.getRegion(), "{},");
            String token;
            logger.debug("Region params: " + params.getRegion());
            int x, y, w, h;
            // top
            if ((token = st.nextToken()).contains(".")) {
                y = Integer.parseInt(token);
            } else {
                y = Integer.parseInt(token);
            }
            // left
            if ((token = st.nextToken()).contains(".")) {
                x = Integer.parseInt(token);
            } else {
                x = Integer.parseInt(token);
            }
            // height
            if ((token = st.nextToken()).contains(".")) {
                h = Integer.parseInt(token);
            } else {
                h = Integer.parseInt(token);
            }
            // width
            if ((token = st.nextToken()).contains(".")) {
                w = Integer.parseInt(token);
            } else {
                w = Integer.parseInt(token);
            }
           
            return new Rectangle(x, y, w, h);
        }

        return null;
    }

   
    private static void setPDFCommandsPath() throws IllegalStateException {
        // sanity check: poppler paths are required. can cache since it won't change
        if (pdftoppmPath == null || pdfinfoPath == null)
        {
            //props = IOUtils.loadConfigByCP(classConfig.getArg("props"));

            pdftoppmPath = ConfigurationManager.getProperty(PROPS_PDF_PDFTOPPM_PATH, DEFAULT_PDFTOPPM_PATH);
            pdfinfoPath = ConfigurationManager.getProperty(PROPS_PDF_PDFINFO_PATH, DEFAULT_PDFINFO_PATH);

            if (pdftoppmPath == null)
                throw new IllegalStateException("No value for key \"" + PROPS_PDF_PDFTOPPM_PATH + "\" in djatoka.properties! Should be path to pdftoppm executable.");
            if (pdfinfoPath == null)
                throw new IllegalStateException("No value for key \"" + PROPS_PDF_PDFINFO_PATH + "\" in djatoka.properties! Should be path to pdfinfo executable.");
        }
    }
}
TOP

Related Classes of gov.lanl.adore.djatoka.plugin.ExtractPDF

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.