Package org.pdfclown.samples.cli

Source Code of org.pdfclown.samples.cli.TextInfoExtractionSample

package org.pdfclown.samples.cli;

import org.pdfclown.documents.Document;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ContentScanner;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.contents.composition.PrimitiveComposer;
import org.pdfclown.documents.contents.objects.ContainerObject;
import org.pdfclown.documents.contents.objects.ContentObject;
import org.pdfclown.documents.contents.objects.Text;
import org.pdfclown.files.File;
import org.pdfclown.tools.PageStamper;

import java.awt.geom.Rectangle2D;

/**
  This sample demonstrates <b>how to retrieve text content along with its graphic attributes</b>
  (font, font size, text color, text rendering mode, text bounding box...) from a PDF document;
  it also generates a document version decorated by text bounding boxes.

  @author Stefano Chizzolini (http://www.stefanochizzolini.it)
  @since 0.0.8
  @version 0.1.0
*/
public class TextInfoExtractionSample
  extends Sample
{
  private DeviceRGBColor[] textCharBoxColors = new DeviceRGBColor[]
    {
      new DeviceRGBColor(200f/255,100f/255,100f/255),
      new DeviceRGBColor(100f/255,200f/255,100f/255),
      new DeviceRGBColor(100f/255,100f/255,200f/255)
    };
  private DeviceRGBColor textStringBoxColor = DeviceRGBColor.Black;

  @Override
  public boolean run(
    )
  {
    String filePath = promptPdfFileChoice("Please select a PDF file");

    // 1. Open the PDF file!
    File file;
    try
    {file = new File(filePath);}
    catch(Exception e)
    {throw new RuntimeException(filePath + " file access error.",e);}

    Document document = file.getDocument();
   
    PageStamper stamper = new PageStamper(); // NOTE: Page stamper is used to draw contents on existing pages.
   
    // 2. Iterating through the document pages...
    for(Page page : document.getPages())
    {
      System.out.println("\nScanning page " + (page.getIndex()+1) + "...\n");

      stamper.setPage(page);

      extract(
        new ContentScanner(page), // Wraps the page contents into a scanner.
        stamper.getForeground()
        );

      stamper.flush();
    }

    // 3. Decorated version serialization.
    serialize(file,false);
   
    return true;
  }

  /**
    Scans a content level looking for text.
  */
  /*
    NOTE: Page contents are represented by a sequence of content objects,
    possibly nested into multiple levels.
  */
  private void extract(
    ContentScanner level,
    PrimitiveComposer composer
    )
  {
    if(level == null)
      return;

    while(level.moveNext())
    {
      ContentObject content = level.getCurrent();
      if(content instanceof Text)
      {
        ContentScanner.TextWrapper text = (ContentScanner.TextWrapper)level.getCurrentWrapper();
        int colorIndex = 0;
        for(ContentScanner.TextStringWrapper textString : text.getTextStrings())
        {
          Rectangle2D textStringBox = textString.getBox();
          System.out.println(
            "Text ["
              + "x:" + Math.round(textStringBox.getX()) + ","
              + "y:" + Math.round(textStringBox.getY()) + ","
              + "w:" + Math.round(textStringBox.getWidth()) + ","
              + "h:" + Math.round(textStringBox.getHeight())
              + "] [font size:" + Math.round(textString.getStyle().getFontSize()) + "]: " + textString.getText()
            );

          // Drawing text character bounding boxes...
          colorIndex = (colorIndex + 1) % textCharBoxColors.length;
          composer.setStrokeColor(textCharBoxColors[colorIndex]);
          for(TextChar textChar : textString.getTextChars())
          {
            /*
              NOTE: You can get further text information
              (font, font size, text color, text rendering mode)
              through textChar.style.
            */
            composer.drawRectangle(textChar.getBox());
            composer.stroke();
          }
         
          // Drawing text string bounding box...
          composer.beginLocalState();
          composer.setLineDash(0, 5);
          composer.setStrokeColor(textStringBoxColor);
          composer.drawRectangle(textString.getBox());
          composer.stroke();
          composer.end();
        }
      }
      else if(content instanceof ContainerObject)
      {
        // Scan the inner level!
        extract(level.getChildLevel(),composer);
      }
    }
  }
}
TOP

Related Classes of org.pdfclown.samples.cli.TextInfoExtractionSample

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.