Examples of org.pdfclown.tools.TextExtractor

org.pdfclown.tools.TextExtractor
@author Stefano Chizzolini (http://www.stefanochizzolini.it) @since 0.0.8 @version 0.1.0

    {throw new RuntimeException(filePath + " file access error.",e);}


    Document document = file.getDocument();


    // 2. Plain text extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, false))
        return false;


      // Extract plain text from the current page!
      System.out.println(extractor.extractPlain(page));
    }


    return true;
  }

View Full Code Here

    {throw new RuntimeException(filePath + " file access error.",e);}


    Document document = file.getDocument();


    // 2. Link extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    extractor.setAreaTolerance(2); // 2 pt tolerance on area boundary detection.
    boolean linkFound = false;
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, !linkFound))
        return false;


      Map<Rectangle2D,List<ITextString>> textStrings = null;
      linkFound = false;


      // Get the page annotations!
      PageAnnotations annotations = page.getAnnotations();
      if(annotations == null)
      {
        System.out.println("No annotations here.");
        continue;
      }


      // Iterating through the page annotations looking for links...
      for(Annotation annotation : annotations)
      {
        if(annotation instanceof Link)
        {
          linkFound = true;


          if(textStrings == null)
          {textStrings = extractor.extract(page);}


          Link link = (Link)annotation;
          Rectangle2D linkBox = link.getBox();


          // Text.
          /*
            Extracting text superimposed by the link...
            NOTE: As links have no strong relation to page text but a weak location correspondence,
            we have to filter extracted text by link area.
          */
          StringBuilder linkTextBuilder = new StringBuilder();
          for(ITextString linkTextString : extractor.filter(textStrings,linkBox))
          {linkTextBuilder.append(linkTextString.getText());}
          System.out.println("Link '" + linkTextBuilder + "' ");


          // Position.
          System.out.println(

View Full Code Here

    {throw new RuntimeException(filePath + " file access error.",e);}


    Document document = file.getDocument();


    // 2. Text extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, false))
        return false;


      List<ITextString> textStrings = extractor.extract(page).get(null);
      for(ITextString textString : textStrings)
      {
        Rectangle2D textStringBox = textString.getBox();
        System.out.println(
          "Text ["

View Full Code Here

TOP

Related Classes of org.pdfclown.tools.TextExtractor

org.pdfclown.documents.contents.ContentScanner

org.pdfclown.documents.contents.objects.ContentObject

org.pdfclown.documents.contents.TextChar

org.pdfclown.documents.contents.TextStyle

org.pdfclown.samples.cli.AdvancedPlainTextExtractionSample

org.pdfclown.samples.cli.AdvancedTextExtractionSample

org.pdfclown.samples.cli.LinkParsingSample

java.awt.geom.Rectangle2D

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.