Examples of org.pdfbox.util.PDFTextStripperByArea

org.pdfbox.util.PDFTextStripperByArea
This class extracts text from a specified region of a PDF page. Author: Ben Litchfield. Version: $Revision: 1.5 $

                    {
                        System.err.println( "Error: Document is encrypted with a password." );
                        System.exit( 1 );
                    }
                }
                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition( true );
                Rectangle rect = new Rectangle( 10, 280, 275, 60 );
                stripper.addRegion( "class1", rect );
                List allPages = document.getDocumentCatalog().getAllPages();
                PDPage firstPage = (PDPage)allPages.get( 0 );
                stripper.extractRegions( firstPage );
                System.out.println( "Text in the area:" + rect );
                System.out.println( stripper.getTextForRegion( "class1" ) );
                
            }
            finally
            {
                if( document != null )

View Full Code Here

            {
                doc = PDDocument.load( args[0] );
                List allPages = doc.getDocumentCatalog().getAllPages();
                for( int i=0; i<allPages.size(); i++ )
                {
                    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                    PDPage page = (PDPage)allPages.get( i );
                    List annotations = page.getAnnotations();
                    //first setup text extraction regions
                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDRectangle rect = link.getRectangle();
                            //need to reposition link rectangle to match text space
                            float x = rect.getLowerLeftX();
                            float y = rect.getUpperRightY();
                            float width = rect.getWidth();
                            float height = rect.getHeight();
                            int rotation = page.findRotation();
                            if( rotation == 0 )
                            {
                                PDRectangle pageSize = page.findMediaBox();
                                y = pageSize.getHeight() - y;
                            }
                            else if( rotation == 90 )
                            {
                                //do nothing
                            }
                            
                            Rectangle2D.Float awtRect = new Rectangle2D.Float( x,y,width,height );
                            stripper.addRegion( "" + j, awtRect );
                        }
                    }
                    
                    stripper.extractRegions( page );
                    
                    for( int j=0; j<annotations.size(); j++ )
                    {
                        PDAnnotation annot = (PDAnnotation)annotations.get( j );
                        if( annot instanceof PDAnnotationLink )
                        {
                            PDAnnotationLink link = (PDAnnotationLink)annot;
                            PDAction action = link.getAction();
                            String urlText = stripper.getTextForRegion( "" + j );
                            if( action instanceof PDActionURI )
                            {
                                PDActionURI uri = (PDActionURI)action;
                                System.out.println( "Page " + (i+1) +":'" + urlText + "'=" + uri.getURI() );
                            }

View Full Code Here

      return result;
  }


    private static void processPage(final List result, final PDPage page, final int pageNum) {
        try {
          final PDFTextStripperByArea stripper = new PDFTextStripperByArea();
          final List linkAnnotations = new ArrayList();
          final List linkRegions = new ArrayList();
            extractAnnotations(page, stripper, linkAnnotations, linkRegions);
            stripper.extractRegions(page);
            final Map uriMap = new HashMap();
            final Map textMap = new HashMap();
            collateLinks(linkAnnotations, linkRegions, uriMap, textMap, stripper);
            final Iterator it = uriMap.keySet().iterator();
            while (it.hasNext()) {

View Full Code Here

TOP

Related Classes of org.pdfbox.util.PDFTextStripperByArea

com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox.PdfBoxPDFPage

org.pdfbox.cos.COSStream

org.pdfbox.examples.pdmodel.PrintURLs

org.pdfbox.examples.util.ExtractTextByArea

org.pdfbox.pdmodel.common.PDStream

java.util.ArrayList

java.awt.geom.Rectangle2D

java.util.Vector

java.io.StringWriter

java.util.Iterator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.