Package org.pdfclown.tools

Examples of org.pdfclown.tools.TextExtractor


    {throw new RuntimeException(filePath + " file access error.",e);}

    Document document = file.getDocument();

    // 2. Plain text extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, false))
        return false;

      // Extract plain text from the current page!
      System.out.println(extractor.extractPlain(page));
    }

    return true;
  }
View Full Code Here


    {throw new RuntimeException(filePath + " file access error.",e);}

    Document document = file.getDocument();

    // 2. Link extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    extractor.setAreaTolerance(2); // 2 pt tolerance on area boundary detection.
    boolean linkFound = false;
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, !linkFound))
        return false;

      Map<Rectangle2D,List<ITextString>> textStrings = null;
      linkFound = false;

      // Get the page annotations!
      PageAnnotations annotations = page.getAnnotations();
      if(annotations == null)
      {
        System.out.println("No annotations here.");
        continue;
      }

      // Iterating through the page annotations looking for links...
      for(Annotation annotation : annotations)
      {
        if(annotation instanceof Link)
        {
          linkFound = true;

          if(textStrings == null)
          {textStrings = extractor.extract(page);}

          Link link = (Link)annotation;
          Rectangle2D linkBox = link.getBox();

          // Text.
          /*
            Extracting text superimposed by the link...
            NOTE: As links have no strong relation to page text but a weak location correspondence,
            we have to filter extracted text by link area.
          */
          StringBuilder linkTextBuilder = new StringBuilder();
          for(ITextString linkTextString : extractor.filter(textStrings,linkBox))
          {linkTextBuilder.append(linkTextString.getText());}
          System.out.println("Link '" + linkTextBuilder + "' ");

          // Position.
          System.out.println(
View Full Code Here

    {throw new RuntimeException(filePath + " file access error.",e);}

    Document document = file.getDocument();

    // 2. Text extraction from the document pages.
    TextExtractor extractor = new TextExtractor();
    for(Page page : document.getPages())
    {
      if(!promptNextPage(page, false))
        return false;

      List<ITextString> textStrings = extractor.extract(page).get(null);
      for(ITextString textString : textStrings)
      {
        Rectangle2D textStringBox = textString.getBox();
        System.out.println(
          "Text ["
View Full Code Here

TOP

Related Classes of org.pdfclown.tools.TextExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.