Package org.jpedal.grouping

Source Code of org.jpedal.grouping.PdfGroupingAlgorithms

/**
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info:  http://www.jpedal.org
* (C) Copyright 1997-2008, IDRsolutions and Contributors.
*
*   This file is part of JPedal
*
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


*
* ---------------
* PdfGroupingAlgorithms.java
* ---------------
*/
package org.jpedal.grouping;

import java.awt.Point;
import java.awt.Rectangle;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jpedal.PdfDecoder;

import org.jpedal.color.GenericColorSpace;
import org.jpedal.exception.PdfException;
import org.jpedal.objects.PdfData;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Fonts;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Sorts;
import org.jpedal.utils.Strip;
import org.jpedal.utils.repositories.Vector_Float;
import org.jpedal.utils.repositories.Vector_Int;
import org.jpedal.utils.repositories.Vector_Object;
import org.jpedal.utils.repositories.Vector_Rectangle;
import org.jpedal.utils.repositories.Vector_String;

/**
* Applies heuristics to unstructured PDF text to create content
*/
public class PdfGroupingAlgorithms {
   
    /** whether HTML tags are kept in teasers; changed through setIncludeHTML(boolean) */
    private boolean includeHTMLtags=false;
   
    /**
     * word detection mode.
     * NOTE(review): presumably holds USER_DEFINED_LIST_ONLY or SURROUND_BY_ANY_PUNCTUATION;
     * it is not read in this part of the file - confirm against its users
     */
    private int wordDetectionTechnique = 0;
   
    /** NOTE(review): these two look like the legal values for wordDetectionTechnique - confirm */
    public static final int USER_DEFINED_LIST_ONLY = 0;
    public static final int SURROUND_BY_ANY_PUNCTUATION = 1;
   
    /**
     * separator text shared by all instances; starts as the line.separator system
     * property and can be replaced with setSeparator(String)
     */
    private static String SystemSeparator = System.getProperty("line.separator");
   
    //public PdfGroupingAlgorithms() {}
 
  /** ==============START OF ARRAYS================ */
  /**
   * content is stored in a set of parallel arrays, all indexed by fragment number.
   * We have tried various methods (ie create composite object, etc) and none are
   * entirely satisfactory. The beauty of this method is speed.
   */

  /**
   * flag to show this item has been merged into another and should be
   * ignored. This allows us to repeat operations on live elements without
   * lots of deleting (merge sets the flag for the fragment it absorbs).
   */
  private boolean[] isUsed;
 
  /**
   * List of punctuation to allow before or after text
   * and still count as a whole word during search
   * Currently supported are </br>
   */
  private String punctuation = "" +
            '\u003A' + // Colon
            '\u005F' + // UnderScore
            '\u0020' + // Space
            '\u0028' + // Open Bracket
            '\u0029' + // Close Bracket
            '\u0021' + // Exclamation Point
            '\u003B' + // Semicolon
            '\u002E' + // Full Stop
            '\u002C' + // Comma
            '\u002F' + // Forward Slash
            '\u002D' + // Dash / Minus
            '\u003D' + // Equals
            '\u002B' + // Plus
            '\u003F' + // Question Mark
            '\u005B' + // Open Square Brackets
            '\u005D' + // Close Square Brackets
            '\u007B' + // Open Curly Brackets
            '\u007D' + // Close Curly Brackets
            '\u0084' + // Double Comma
            '\u0085' + // Ellipsis
            '\u0093' + // Smart Double Quote Open
            '\u0094' + // Smart Double Quote Close
            '\u0091' + // Smart Single Quote Open
            '\u0092' + // Smart Single Quote Close
            '\u201C' + // Left Double Quotation Mark
            '\u201D' + // Right Double Quotation Mark
      "\u005C\u005C" + // Back Slash
      "\u005C\u0027" + // Single Quotation mark
      "\u005C\u0022"// Double Quotation mark

  /**
   * co-ords of object (x1,y1 is top left). When two fragments are merged the
   * box grows to the smaller x1, larger y1, larger x2 and smaller y2, so y1 is
   * the numerically larger of the two y values.
   */
  private float[] f_x1, f_x2, f_y1, f_y2;
 
  /**
   * track if we removed space from end
   * (only allocated by the area-based copyToArrays, not by the no-argument one)
   */
  private boolean[] hadSpace;
 
  /**hold colour info (copied from PdfData.colorTag)*/
  private String[] f_colorTag;
 
  /**hold writing mode (copied from PdfData.f_writingMode)*/
  private int[] writingMode;
 
  /**hold move type (copied from PdfData.move_command)*/
  private int[] moveType;

  /** font sizes in pixels; after a merge this is the size of the fragment appended last */
  private int[] fontSize;

  /** amount of space a space uses in this font/size */
  private float[] spaceWidth;

  /** actual text; the entry is set to null once the fragment has been merged into another */
  private StringBuffer[] content;

  /** raw number of text characters (excludes tags and markers); summed when fragments merge */
  private int[] textLength;

  /** ==============END OF ARRAYS================ */

    /**
   * handle on page data object. We extract data from this into local arrays
   * and return grouped content into object at end. This is done for speed.
   */
  private PdfData pdf_data;

    /** page data object handed to the constructor (package-private; not read in this part of the file) */
    PdfPageData pageData;

  /** flag to show if output for table is CSV or XHTML */
  private boolean isXHTML = true;

  /** slot to insert next value - used when we split fragments for table code */
  private int nextSlot;

  /**
   * vertical breaks for table calculation - findVerticalLines adds the x
   * positions it considers frequent enough to be a column edge
   */
  private Vector_Int lineBreaks = new Vector_Int();

  /** holds details as we scan lines for table */
  private Vector_Object lines;

  /** lookup table used to sort into correct order for table */
  private Vector_Int lineY2;

  /**
   * marker char used in content (we bury location for each char so we can
   * split). MARKER2 is the first character of MARKER, used when the raw text
   * is scanned character by character.
   * NOTE(review): MARKER2 is public and not final, so outside code can change it - confirm this is intended
   */
  private static final String MARKER = PdfData.marker;
  public static char MARKER2= MARKER.charAt(0);

  /** counters for cols and rows and pointer to final object we merge into */
  private int max_rows = 0, master = 0;
 
  /**
   * flag to show color info is being extracted
   * (refreshed from pdf_data.isColorExtracted() in the constructor and in copyToArrays())
   */
  private boolean colorExtracted=false;
 
  /** used to calculate correct order for table lines */
  private int[] line_order;

  /**
   * amount we resize arrays holding content with if no space; also the number of
   * spare slots the area-based copyToArrays allocates up front
   */
  private final static int increment = 100;

  /** NOTE(review): not read in this part of the file - purpose to be confirmed from its users */
  public static boolean useUnrotatedCoords;

  /**end points if text located*/
  private float[] endPoints;

  /**flag to show if tease created on findText*/
  private boolean includeTease;

  /**teasers for findtext*/
  private String[] teasers;

  /** NOTE(review): presumably teasers collected when several search terms are used - confirm */
  private List multipleTermTeasers = new ArrayList();

  /** NOTE(review): presumably true while a multi-term search is running - confirm */
  private boolean usingMultipleTerms = false;

    /** true when extracted text carries XML tags (font/color); set by the constructor */
    private boolean isXMLExtraction=true;

    /*
      * Variables to allow cross line search results
      */
  /**Value placed between result areas to show they are part of the same result*/
  private int linkedSearchAreas=-101;
 
  /** create a new instance, passing in raw data */
  public PdfGroupingAlgorithms(PdfData pdf_data, PdfPageData pageData, boolean isXMLExtraction) {
    this.pdf_data = pdf_data;
    this.pageData=pageData;
        this.isXMLExtraction=isXMLExtraction;
    colorExtracted=pdf_data.isColorExtracted();
    }
  public static void setSeparator(String sep){
    SystemSeparator = sep;
  }
 
  /**
   * workout if we should use space, CR or no separator when joining lines
   */
  static final private String getLineDownSeparator(StringBuffer rawLine1, StringBuffer rawLine2, boolean isXMLExtraction) {

    String returnValue = " "; //space is default

    boolean hasUnderline = false;

    /**get 2 lines without any XML or spaces so we can look at last char*/
    StringBuffer line1,line2;
    if(isXMLExtraction){
      line1 = Strip.stripXML(rawLine1,isXMLExtraction);
      line2 = Strip.stripXML(rawLine2,isXMLExtraction);
    }else{
      line1 = Strip.trim(rawLine1);
      line2 = Strip.trim(rawLine2);
    }
   
    /**get lengths and if appropriate perform tests*/
    int line1Len = line1.length();
    int line2Len = line2.length();
    //System.out.println(line1Len+" "+line2Len);
    if((line1Len>1)&&(line2Len>1)){

      /**get chars to test*/
      char line1Char2 = line1.charAt(line1Len - 1);
      char line1Char1 = line1.charAt(line1Len - 2);
      char line2Char1 = line2.charAt(0);
      char line2Char2 = line2.charAt(1);

      //deal with hyphenation first - ignore unless :- or space-
            String hyphen_values = "";
            if (hyphen_values.indexOf(line1Char2) != -1) {
        returnValue = ""; //default of nothing
        if (line1Char1 == ':')
          returnValue = "\n";
        if (line1Char2 == ' ')
          returnValue = " ";
 
        //paragraph breaks if full stop and next line has ascii char or Capital Letter
      } else if (
        ((line1Char1 == '.') | (line1Char2 == '.'))
          & (Character.isUpperCase(line2Char1)
            | (line2Char1 == '&')
            | Character.isUpperCase(line2Char2)
            | (line2Char2 == '&'))){
        if(isXMLExtraction)
          returnValue = "<p></p>\n";
        else
          returnValue="\n";
      }

    }
   
    //add an underline if appropriate
    if (hasUnderline){
      if(isXMLExtraction)
        returnValue = returnValue + "<p></p>\n";
      else
        returnValue=returnValue+ '\n';
    }
   
    return returnValue;
  }

  /**
   * remove shadows from text created by double printing of text and drowned
   * items where text inside other text
   */
  private final void cleanupShadowsAndDrownedObjects(boolean avoidSpaces) {

    //get list of items
    int[] items = getUnusedFragments();
    int count = items.length;
    int c, n;
    String separator;
        float diff;

        //work through objects and eliminate shadows or roll together overlaps
    for (int p = 0; p < count; p++) {

      //master item
      c = items[p];

      //ignore used items
      if (isUsed[c] == false) {

        //work out mid point in text
        float midX = (f_x1[c] + f_x2[c]) / 2;
        float midY = (f_y1[c] + f_y2[c]) / 2;
       
        for (int p2 = p + 1;p2 < count;p2++) {

          //item to test against
          n = items[p2];
          if ((isUsed[n] == false) && (isUsed[c] == false)) {

            float fontDiff=this.fontSize[n]-fontSize[c];
            if(fontDiff<0)
              fontDiff=-fontDiff;

                        diff = (f_x2[n] - f_x1[n]) - (f_x2[c] - f_x1[c]);
                        if(diff<0)
                                diff=-diff;

                        /** stop spurious matches on overlapping text*/
            if (fontDiff==0 && (midX > f_x1[n])&& (midX < f_x2[n])
              && (diff< 10)
              && (midY < f_y1[n])&& (midY > f_y2[n])) {
             
              isUsed[n] = true;
             
              //pick up drowned text items (item inside another)     
            } else {
       
              boolean a_in_b =
                (f_x1[n] > f_x1[c])&& (f_x2[n] < f_x2[c])
                  && (f_y1[n] < f_y1[c])&& (f_y2[n] > f_y2[c]);
              boolean b_in_a =
                (f_x1[c] > f_x1[n])&& (f_x2[c] < f_x2[n])
                  && (f_y1[c] < f_y1[n])&& (f_y2[c] > f_y2[n]);
             
              //merge together
              if (a_in_b || b_in_a) {
                //get order right - bottom y2 underneath
                if (f_y2[c] > f_y2[n]) {
                  separator =getLineDownSeparator(content[c],content[n],isXMLExtraction);
                  if((avoidSpaces==false)||(separator.indexOf(' ')==-1)){
                    merge(c,n,separator,true);
                  }
                } else {
                  separator =getLineDownSeparator(content[n],content[c],isXMLExtraction);
                  if(!avoidSpaces || separator.indexOf(' ')==-1){
                    merge(n,c,separator,true);
                  }
                }
               
                //recalculate as may have changed
                midX = (f_x1[c] + f_x2[c]) / 2;
                midY = (f_y1[c] + f_y2[c]) / 2;
               
              }
            }
          }
        }
      }
    }
  }
 
  /**
   * general routine to see if we add a space between 2 text fragments
   */
  final private String isGapASpace(int c, int l, float actualGap,boolean addMultiplespaceXMLTag,int writingMode) {
    String sep = "";
    float gap;

    //use smaller gap
    float gapA = spaceWidth[c] * fontSize[c];
    float gapB = spaceWidth[l] * fontSize[l];

    if (gapA > gapB)
      gap = gapB;
    else
      gap = gapA;

        gap = (actualGap / (gap / 1000));

        //Round values to closest full integer as float -> int conversion rounds down
        if(gap > 0.51f && gap<1)
            gap = 1;

        int spaceCount = (int) gap;

    if (spaceCount > 0)
      sep = " ";

    /** add an XML tag to flag multiple spaces */
    if (spaceCount > 1 && addMultiplespaceXMLTag && writingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
      sep = " <SpaceCount space=\"" + spaceCount + "\" />";

    return sep;
  }

  /**
   * merge 2 text fragments together and update co-ordinates
   */
  final private void merge(int m,int c,String separator,boolean moveFont) {

      //update co-ords
      if (f_x1[m] > f_x1[c])
        f_x1[m] = f_x1[c];
      if (f_y1[m] < f_y1[c])
        f_y1[m] = f_y1[c];
      if (f_x2[m] < f_x2[c])
        f_x2[m] = f_x2[c];
      if (f_y2[m] > f_y2[c])
        f_y2[m] = f_y2[c];

      if(isXMLExtraction){
        String test=Fonts.fe;

        //add color tag if needed and changes
        if(colorExtracted)
          test=Fonts.fe+GenericColorSpace.ce;

        //move </Font> if needed and add separator
        if ((moveFont) && (content[m].toString().lastIndexOf(test)!=-1)) {
          String master = content[m].toString();
          content[m] =new StringBuffer(master.substring(0, master.lastIndexOf(test)));
          content[m].append(separator);
          content[m].append(master.substring(master.lastIndexOf(test)))
        } else{
          content[m].append(separator)
        }

                //Only map out space if text length is longer than 1
        if(textLength[c]>1 && content[m].toString().endsWith(" ")){
          content[m].deleteCharAt(content[m].lastIndexOf(" "));
        }
        //use font size of second text (ie at end of merged text)
        fontSize[m] = fontSize[c];
       
        //Remove excess / redundent xml tags
        if(content[c].indexOf("<color")!=-1 && content[m].indexOf("<color")!=-1){
          if(content[c].toString().startsWith(content[m].substring(content[m].lastIndexOf("<color"), content[m].indexOf(">", content[m].lastIndexOf("<color")))) &&
              content[m].lastIndexOf("</color>")+7==content[m].lastIndexOf(">")){
            content[c].replace(content[c].indexOf("<color"), content[c].indexOf(">")+1, "");
            content[m].replace(content[m].lastIndexOf("</color>"), content[m].lastIndexOf("</color>")+8, "");
          }
        }
       
        if(content[c].indexOf("<font")!=-1 && content[m].indexOf("<font")!=-1){
          if(content[c].toString().startsWith(content[m].substring(content[m].lastIndexOf("<font"), content[m].indexOf(">",content[m].lastIndexOf("<font")))) &&
              content[m].lastIndexOf("</font>")+6==content[m].lastIndexOf(">")){
            content[c].replace(content[c].indexOf("<font"), content[c].indexOf(">")+1, "");
            content[m].replace(content[m].lastIndexOf("</font>"), content[m].lastIndexOf("</font>")+7, "");
          }
        }
       
        content[m] = content[m].append(content[c]);
       
        //track length of text less all tokens
        textLength[m] = textLength[m] + textLength[c];

        //set objects to null to flush and log as used
        isUsed[c] = true;   
        content[c] = null;
      }else{

        //use font size of second text (ie at end of merged text)
        fontSize[m] = fontSize[c];

        //add together
        content[m] = content[m].append(separator).append(content[c]);

        //track length of text less all tokens
        textLength[m] = textLength[m] + textLength[c];

        //set objects to null to flush and log as used
        isUsed[c] = true;   
        content[c] = null;
      }
  }
 
  /**
   * remove width data we may have buried in data
   */
  final private void removeEncoding() {

    // get list of items
    int[] items = getUnusedFragments();
    int count = items.length;
    int current;

    // work through objects and eliminate shadows or roll together overlaps
        for (int item : items) {

            // master item
            current = item;

            // ignore used items and remove widths we hid in data
            if (isUsed[current] == false)
                content[current] = removeHiddenMarkers(current);
        }
  }

  /**
   * put raw data into Arrays for quick merging breakup_fragments shows if we
   * break on vertical lines and spaces
   */
  final private void copyToArrays() {

    colorExtracted=pdf_data.isColorExtracted();
   
    int count = pdf_data.getRawTextElementCount();

    //local lists for faster access
    isUsed = new boolean[count];
    fontSize = new int[count];
    writingMode=new int[count];
    spaceWidth = new float[count];
    content = new StringBuffer[count];
    textLength = new int[count];

    f_x1 = new float[count];
    f_colorTag=new String[count];
    f_x2 = new float[count];
    f_y1 = new float[count];
    f_y2 = new float[count];
    moveType=new int[count];


   
    //set values
    for (int i = 0; i < count; i++) {
      content[i] = new StringBuffer(pdf_data.contents[i]);

      fontSize[i] = pdf_data.f_end_font_size[i];
      writingMode[i]=pdf_data.f_writingMode[i];
      f_x1[i] = pdf_data.f_x1[i];
      f_colorTag[i]=pdf_data.colorTag[i];
      f_x2[i] = pdf_data.f_x2[i];
      f_y1[i] = pdf_data.f_y1[i];
      f_y2[i] = pdf_data.f_y2[i];
      moveType[i]=pdf_data.move_command[i];

      spaceWidth[i] = pdf_data.space_width[i];
      textLength[i] = pdf_data.text_length[i];
    }
  }
 
  /**
   * get list of unused fragments and put in list
   */
  private int[] getUnusedFragments() {
    int total_fragments = isUsed.length;

    //get unused item pointers
    int ii = 0;
    int temp_index[] = new int[total_fragments];
    for (int i = 0; i < total_fragments; i++) {
      if (isUsed[i] == false) {
        temp_index[ii] = i;
        ii++;
      }
    }
   
    //put into correctly sized array
    int[] items = new int[ii];
        System.arraycopy(temp_index, 0, items, 0, ii);
    return items;
  }


  /**
   * strip the hidden numbers of position we encoded into the data
   * (could be coded to be faster by not using Tokenizer)
   */
  private StringBuffer removeHiddenMarkers(int c) {

    //make sure has markers and ignore if not
    if (content[c].indexOf(MARKER) == -1)
      return content[c];
   
    //strip the markers
    StringTokenizer tokens =new StringTokenizer(content[c].toString(), MARKER, true);
    String temp;
    StringBuffer processedData = new StringBuffer();
   
    //with a token to make sure cleanup works
    while (tokens.hasMoreTokens()) {

      //strip encoding in data
      temp = tokens.nextToken(); //see if first marker
     
      if (temp.equals(MARKER)) {
        tokens.nextToken(); //point character starts
        tokens.nextToken(); //second marker
        tokens.nextToken(); //width
        tokens.nextToken(); //third marker

        //put back chars
        processedData = processedData.append(tokens.nextToken());
       
      } else
        processedData = processedData.append(temp);
    }
   
    return processedData;
  }

    /**
     * sets if we include HTML in teasers
     * (do we want this is <b>word</b> or this is word as teaser)
     * @param value
     */
    public void setIncludeHTML(boolean value) {
        includeHTMLtags=value;
    }
 
  /**
   * method to show data without encoding
   */
  public static String removeHiddenMarkers(String contents) {

    //trap null
    if(contents==null)
      return null;
   
    //run though the string extracting our markers

    //make sure has markers and ignore if not
    if (!contents.contains(MARKER))
      return contents;

    //strip the markers
    StringTokenizer tokens = new StringTokenizer(contents, MARKER, true);
    String temp_token;
    StringBuffer processed_data = new StringBuffer();
   
    //with a token to make sure cleanup works
    while (tokens.hasMoreTokens()) {

      //encoding in data
      temp_token = tokens.nextToken(); //see
                                     // if
                                     // first
                                     // marker
      if (temp_token.equals(MARKER)) {
        tokens.nextToken(); //point character starts
        tokens.nextToken(); //second marker
        tokens.nextToken(); //width
        tokens.nextToken(); //third marker

        //put back chars
        processed_data = processed_data.append(tokens.nextToken());
        //value
      } else
        processed_data = processed_data.append(temp_token);
    }
    return processed_data.toString();
  }

  /**
   * Method to try and find vertical lines in close data
   * (not as efficient as it could be)
   * @throws PdfException
   */
  private void findVerticalLines(float minX,float minY,float maxX,float maxY,int currentWritingMode) throws PdfException {

    //hold counters on all x values
    HashMap xLines = new HashMap();

    //counter on most popular item
    int most_frequent = 0, count = pdf_data.getRawTextElementCount();
    float x1, x2, y1, y2;
    String raw;

    for (int i = 0; i < count; i++) {
      float currentX = 0, lastX;
      Integer intX;

      //extract values for data
      raw = this.pdf_data.contents[i];

      /**
       * set pointers so left to right text
       */
      if(currentWritingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
        x1=this.f_x1[i];
        x2=this.f_x2[i];
        y1=this.f_y1[i];
        y2=this.f_y2[i];
      }else if(currentWritingMode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
        x2=this.f_x1[i];
        x1=this.f_x2[i];
        y1=this.f_y1[i];
        y2=this.f_y2[i];
      }else if(currentWritingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
        x1=this.f_y1[i];
        x2=this.f_y2[i];
        y1=this.f_x2[i];
        y2=this.f_x1[i];
      }else if(currentWritingMode==PdfData.VERTICAL_TOP_TO_BOTTOM){
        x1=this.f_y2[i];
        x2=this.f_y1[i];
        y2=this.f_x1[i];
        y1=this.f_x2[i];
      }else{
        throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
      }
     
      //if in the area, process
      if ((x1 > minX - .5)&& (x2 < maxX + .5)&& (y2 > minY - .5)&& (y1 < maxY + .5)) {

        //run though the string extracting our markers to get x values
        StringTokenizer tokens =new StringTokenizer(raw, MARKER, true);
        String value, lastValue = "";
        Object currentValue;

        while (tokens.hasMoreTokens()) {

          //encoding in data
          value = tokens.nextToken(); //see if first marker
          if (value.equals(MARKER)) {

            value = tokens.nextToken(); //point character starts

            if (value.length() > 0) {

              lastX = currentX;
              currentX = Float.parseFloat(value);
              try {

                //add x to list or increase counter at start
                // or on space
                //add points either side of space
                if (lastValue.length() == 0 || (lastValue.indexOf(' ') != -1)) {

                  intX = (int) currentX;
                  currentValue = xLines.get(intX);
                  if (currentValue == null) {
                    xLines.put(intX, 1);
                  } else {
                    int countReached = (Integer) currentValue;
                    countReached++;

                    if (countReached > most_frequent)
                      most_frequent = countReached;

                    xLines.put(intX, countReached);
                  }

                  //work out the middle
                  int middle =(int) (lastX+ ((currentX - lastX) / 2));

                  if (lastX != 0) {
                    intX = middle;
                    currentValue = xLines.get(intX);
                    if (currentValue == null) {
                      xLines.put(intX, 1);
                    } else {
                      int count_reached = (Integer) currentValue;
                      count_reached++;

                      if (count_reached > most_frequent)
                        most_frequent = count_reached;

                      xLines.put(intX, count_reached);
                    }
                  }
                }

              } catch (Exception e) {
                LogWriter.writeLog(
                  "Exception " + e + " stripping x values");
              }
            }

            tokens.nextToken(); //second marker
            tokens.nextToken(); //glyph  width
            tokens.nextToken(); //third marker
            value = tokens.nextToken(); //put back chars
            lastValue = value;

          }
        }
      }
    }

    //now analyse the data
    Iterator keys = xLines.keySet().iterator();
    int minimum_needed =  most_frequent / 2;

    while (keys.hasNext()) {
      Integer current_key = (Integer) keys.next();
      int current_count = (Integer) xLines.get(current_key);

      if (current_count > minimum_needed)
        lineBreaks.addElement(current_key);

    }
  }

  /**
   * Method splitFragments adds raw frgaments to processed fragments breaking
   * up any with vertical lines through or what looks like tabbed spaces
   * @throws PdfException
   */
  private void copyToArrays(
      float minX,float minY,float maxX,float maxY,
      boolean keepFont,boolean breakOnSpace,boolean findLines,String punctuation, boolean isWordlist) throws PdfException {

    final boolean debugSplit=false;
   
    //initialise local arrays allow for extra space
    int count = pdf_data.getRawTextElementCount() + increment;
   
    f_x1 = new float[count];
    f_colorTag=new String[count];
    hadSpace=new boolean[count];
    f_x2 = new float[count];
    f_y1 = new float[count];
    f_y2 = new float[count];
   
    spaceWidth = new float[count];
    content = new StringBuffer[count];
    fontSize = new int[count];
    textLength = new int[count];
    writingMode=new int[count];
    isUsed=new boolean[count];
    moveType=new int[count];
   
    //flag to find lines based on orientation of first text item*/
    boolean linesScanned=false;
   
    //set defaults and calculate dynamic values
    int text_length;
    count = count-increment;
    float last_pt,min,max,pt,x1,x2,y1,y2,linePos,character_spacing;
    String raw, char_width = "",currentColor;
    StringBuffer text = new StringBuffer();
   
    //work through fragments
    for (int i = 0; i < count; i++) {
     
      //extract values
      character_spacing = pdf_data.f_character_spacing[i];
      raw = pdf_data.contents[i];
      x1 = pdf_data.f_x1[i];
      currentColor=pdf_data.colorTag[i];
      x2 = pdf_data.f_x2[i];
      y1 = pdf_data.f_y1[i];
      y2 = pdf_data.f_y2[i];
      text_length = pdf_data.text_length[i];
      int mode=pdf_data.f_writingMode[i];
      int moveType=pdf_data.move_command[i];
     
      /**
       * see if in area
       */
      boolean accepted=false;

      if(debugSplit){
          System.out.println("raw data="+raw);
          System.out.println("text data="+PdfGroupingAlgorithms.removeHiddenMarkers(raw));
            }

      //if at least partly in the area, process
      if ((mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT) &&
          y2 > minY && y1 < maxY && x1<maxX && x2>minX){
        accepted=true;
      }else if((mode==PdfData.VERTICAL_BOTTOM_TO_TOP || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)&&
          x1 > minX && x2 < maxX && y1>minY && y2<maxY)
        accepted=true;
     
      if(accepted){
       
        /**find lines*/
        //look for possible vertical or horizontal lines in the data
        if((!linesScanned)&&(findLines)){
          findVerticalLines(minX, minY, maxX, maxY,mode);
          linesScanned=true;
        }
       
        /**
         * initialise pointers and work out an
         * 'average character space'
         **/
        if (mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
          //space = (x2 - x1) / text_length;
          pt = x1;
          last_pt = x1;
          min=minX;
          max=maxX;
        }else{ //vertical text
          //space = (y1 - y2) / text_length;
          pt = y2;
          last_pt = y2;
          min=minY;
          max=maxY;
        }
       
        linePos = -1;
       
        /**
         * work through text, using embedded markers to work out whether
         * each letter is IN or OUT
         */
        char[] line=raw.toCharArray();
       
        int end=line.length;
        int pointer=0;
       
        String value, textValue = "", pt_reached;

        //allow for no tokens and return all text fragment
        if (!raw.contains(MARKER))
          text = new StringBuffer(raw);
       
        boolean isFirstValue=true, breakPointset=false;
       
        /**
         * work through text, using embedded markers to work out whether
         * each letter is IN or OUT
         */
        while(pointer<end){
         
          //only data between min and y locations
          while (true) {
           
            /**
             * read value
             */
           
            if(line[pointer]!=MARKER2){
              //find second marker and get width
              int startPointer=pointer;
              while((pointer<end)&&(line[pointer]!=MARKER2))
                  pointer++;
              value = raw.substring(startPointer,pointer);
             
            }else{//if (value.equals(MARKER)) { // read the next token and its location and width
             
              //find first marker
              while((pointer<end)&&(line[pointer]!=MARKER2))
                  pointer++;
             
              pointer++;
               
              //find second marker and get width
              int startPointer=pointer;
              while((pointer<end)&&(line[pointer]!=MARKER2))
                  pointer++;
              pt_reached = raw.substring(startPointer,pointer);
              pointer++;
             
              //find third marker
              startPointer=pointer;
              while((pointer<end)&&(line[pointer]!=MARKER2))
                  pointer++;
             
              char_width=raw.substring(startPointer,pointer);
              pointer++;
               
              //find next marker
              startPointer=pointer;
              while((pointer<end)&&(line[pointer]!=MARKER2))
                pointer++;
             
              value = raw.substring(startPointer,pointer);
             
              textValue = value; //keep value with no spaces
             
              if (pt_reached.length() > 0) { //set point character starts
                last_pt = pt;
                pt = Float.parseFloat(pt_reached)

                                if(breakPointset){
                                  if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
                          x1 = pt;
                        else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
                          x2 = pt;
                        else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
                          y2=pt; 
                        else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
                          y1=pt;
                                    breakPointset=false;
                                }
              }
             
              //add font start if needed
              if ((isXMLExtraction)&&(last_pt < min)&& (pt > min)&& (!value.startsWith(Fonts.fb)))
                value = Fonts.getActiveFontTag(raw, "")+ value;
             
            }
           
            if ((pt > min) & (pt < max)){
              if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
                  if((x1<min || x1>max) && pt>=min)
                    x1 = pt;
                else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
                  if((x2>max || x2<min) && pt<=max)
                    x2 = pt;
                else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
                  if((y2<min || y2>max) && pt>=min)
                    y2=pt; 
                else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
                  if((y1<min || y1>max) && pt<=min)
                    y1=pt;
              break;
            }
           
            value = "";
            textValue = "";
           
            if(pointer>=end)
              break;
          }
         
          /**make sure font not sliced off on first value*/
          if((isFirstValue)){
           
            isFirstValue=false;
            if((isXMLExtraction)&&(keepFont)&&(!value.startsWith(Fonts.fb))&&(!value.startsWith(GenericColorSpace.cb)))//&&(!text.toString().startsWith(Fonts.fb))))
            text.append(Fonts.getActiveFontTag(text.toString(), raw));
          }
         
          /**
           * we now have a valid value inside the selected area so perform tests
           */
          //see if a break occurs
          boolean is_broken = false;
          if(findLines && character_spacing > 0 && text.toString().endsWith(" ")) {
            int counts = lineBreaks.size();
            for (int jj = 0; jj < counts; jj++) {
              int test_x = lineBreaks.elementAt(jj);
              if ((last_pt < test_x) & (pt > test_x)) {
                jj = counts;
                is_broken = true;
              }
            }
          }
         
          boolean endsWithPunctuation = checkForPunctuation(textValue,punctuation);
         
          if (is_broken) { //break on double-spaces or larger
           
            if(debugSplit)
            System.out.println("Break 1 is_broken");
             
            float Nx1=x1,Nx2=x2,Ny1=y1,Ny2=y2;
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
              Nx2 = last_pt + Float.parseFloat(char_width);
            else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
              Nx1 = last_pt + Float.parseFloat(char_width);
            else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
              Ny1=last_pt + Float.parseFloat(char_width);
            else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              Ny2=last_pt + Float.parseFloat(char_width);
           
            addFragment(moveType,i,text,Nx1,Nx2,Ny1,Ny2,text_length,keepFont,currentColor,isWordlist);
            text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
            text.append(value);
           
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
              x1 = pt;
            else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
              x2 = pt;
            else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
              y2=pt; 
            else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              y1=pt;
           
          } else if ((endsWithPunctuation)|
              ((breakOnSpace) && ((textValue.indexOf(' ') != -1)||(value.endsWith(" "))))|((textValue.contains("   ")))) {//break on double-spaces or larger
            if(debugSplit)
            System.out.println("Break 2 endsWithPunctuation="+endsWithPunctuation+" textValue="+textValue+ '<'+" value="+value+ '<' +" text="+text+ '<');
           
           
            //Remove final bit of the below if to fix issue in case 11542
            if(textValue.length()>1 && textValue.indexOf(' ')!=-1){// && x1==pt){ //add in space values to start of next shape
              //count the spaces
              int ptr=textValue.indexOf(' ');
             
              if(ptr>0){
                pt=pt+ ptr*(Float.parseFloat(char_width)/textValue.length());
              }
              //else
              //  pt=pt+Float.parseFloat(char_width);

            }
           
            if (!endsWithPunctuation)
            text.append(value.trim());
           
           
                       
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){

                            if(debugSplit)
                            System.out.println("Add "+x1+ ' ' +pt+" text="+text+" i="+i);
              addFragment(moveType,i,text,x1,pt,y1,y2,text_length,keepFont,currentColor,isWordlist);
                        }else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
                            if(debugSplit)
                            System.out.println("b");
              addFragment(moveType,i,text,pt,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
                        }else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP){
                            if(debugSplit)
                            System.out.println("c");
              addFragment(moveType,i,text,x1,x2,pt,y2,text_length,keepFont,currentColor,isWordlist);
                        }else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM) {
                            if(debugSplit)
                            System.out.println("d");
              addFragment(moveType,i,text,x1,x2,y1,pt,text_length,keepFont,currentColor,isWordlist);
                        }
           
            if(char_width.length()>0){ //add in space values to start of next shape
                            //count the spaces
                            int ptr=0;
                           
                            if(textValue.indexOf(' ')!=-1)
                              ptr=textValue.indexOf(' ');
                           
                            if(isWordlist){
                                int len=textValue.length();
                                while(ptr<len && textValue.charAt(ptr)==' '){
                                    ptr++;
                                }
                            }
                           
                            if(ptr>0)
                  pt=pt+ ptr*Float.parseFloat(char_width);
                            else
                                pt=pt+Float.parseFloat(char_width);
                           
                            if(ptr>0)
                              breakPointset=true;
                            else
                              breakPointset=false;

                        }

            //store fact it had a space in case we generate wordlist
            if((breakOnSpace)&(nextSlot>0))
            hadSpace[nextSlot-1]=true;
           
            text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
              x1 = pt;// + space;
            else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
              x2 = pt;// - space;
            else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
              y2 = pt;// + space;
            else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              y1 = pt;// - space;           
           
          } else if ((linePos != -1) & (pt > linePos)) {//break on a vertical line
           
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
              addFragment(moveType,i,text,x1,linePos,y1,y2,text_length,keepFont,currentColor,isWordlist);
            else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
              addFragment(moveType,i,text,linePos,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
            else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
              addFragment(moveType,i,text,x1,x2,linePos,y2,text_length,keepFont,currentColor,isWordlist);
            else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              addFragment(moveType,i,text,x1,x2,y1,linePos,text_length,keepFont,currentColor,isWordlist);
           
            text =new StringBuffer(Fonts.getActiveFontTag(text.toString(), raw));
            text.append(value);
           
            if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
              x1 = linePos;
            else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
              x2 = linePos;
            else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
              y2 = linePos;
            else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              y1 = linePos;
           
            linePos = -1;
           
          } else { //allow for space used as tab
            if ((isXMLExtraction)&&(value.endsWith(' ' +Fonts.fe))) {
              value = Fonts.fe;
              textValue = "";
             
              if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT)
                x2 = last_pt;
              else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
                x1=last_pt;
              else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
                y1 = last_pt;
              else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
                y2 = last_pt;   
            }
            text.append(value);
          }
         
        }
       
        //trap scenario we found if all goes through with no break at end
        if((keepFont)&&(isXMLExtraction)&&
            (!text.toString().endsWith(Fonts.fe))&&
            (!text.toString().endsWith(GenericColorSpace.ce)))
          text.append(Fonts.fe);
       
        //create new line with what is left and output
        if (mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){ 
          if (x1 < x2)
            addFragment(moveType,i,text,x1,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
        }else if (mode==PdfData.VERTICAL_BOTTOM_TO_TOP || mode==PdfData.VERTICAL_TOP_TO_BOTTOM){
          if(y1 > y2)
            addFragment(moveType,i,text,x1,x2,y1,y2,text_length,keepFont,currentColor,isWordlist);
        }
        text = new StringBuffer();
       
      }
    }

    //local lists for faster access
    isUsed = new boolean[nextSlot];

  }

  /**
   * @param textValue
   * @return
   */
  private static boolean checkForPunctuation(String textValue,String punctuation) {
   
    if(punctuation==null)
      return false;
   
    /** see if ends with punctuation */
    boolean endsWithPunctuation = false;
    int textLength = textValue.length();
    int ii = textLength - 1;
    if (textLength > 0) { //strip any spaces and tags in test
      char testChar = textValue.charAt(ii);
      boolean inTag = (testChar == '>');
      while (((inTag) | (testChar == ' ')) & (ii > 0)) {
       
        if (testChar == '<')
          inTag = false;
       
        ii--;
        testChar = textValue.charAt(ii);
       
        if (testChar == '>')
          inTag = true;
      }
     
      //stop  matches on &;
      if((testChar==';')){
        //ignore if looks like &xxx;
        endsWithPunctuation = true;
        ii--;
        while(ii>-1){
         
          testChar=textValue.charAt(ii);
          if(testChar=='&' || testChar=='#'){
            endsWithPunctuation = false;
            ii=0;
          }
         
          if(ii==0 || testChar==' ' || !Character.isLetterOrDigit(testChar))
              break;
         
          ii--;
        }
      }else if (punctuation.indexOf(testChar) != -1)
        endsWithPunctuation = true;
     
    }
    return endsWithPunctuation;
  }

  /**
   * add an object to our new XML list
   */
  private void addFragment(
      int moveType,
      int index,
      StringBuffer contentss,
      float x1,
      float x2,
      float y1,
      float y2,
      int text_len,
      boolean keepFontTokens,String currentColorTag,boolean isWordlist) {

    StringBuffer current_text = contentss;
    String str=current_text.toString();
   
    //strip <> or ascii equivalents
    if(isWordlist){
      if(str.contains("&#"))
        current_text=Strip.stripAmpHash(current_text);
     
      if((isXMLExtraction)&&((str.contains("&lt;"))||(str.contains("&gt;"))))
        current_text=Strip.stripXMLArrows(current_text,true);
      else if((!isXMLExtraction)&&((str.indexOf('<')!=-1)||(str.indexOf('>')!=-1)))
        current_text=Strip.stripArrows(current_text);
    }
   
//    StringBuffer justText=Strip.stripXML(current_text);

    //ignore blank space objects
    //if (justText.length() == 0) {
     
    if(getFirstChar(current_text)!=-1){

      //strip tags or pick up missed </font> if ends with space
      if (keepFontTokens == false) {

        //strip fonts if required
        current_text = Strip.stripXML(current_text,isXMLExtraction);

      } else if (isXMLExtraction){
       
        //no color tag
          if(pdf_data.isColorExtracted()&&(!current_text.toString().endsWith(GenericColorSpace.ce))){
           
            //se
            //if ends </font> add </color>
            //otherwise add </font></color>
            if(!current_text.toString().endsWith(Fonts.fe))
              current_text = current_text.append(Fonts.fe);
            current_text = current_text.append(GenericColorSpace.ce);
           
          }else if((!pdf_data.isColorExtracted())&&(!current_text.toString().endsWith(Fonts.fe)))      
                current_text = current_text.append(Fonts.fe);               
      }
     
      /***/
      //add to vacant slot or create new slot
      int count = f_x1.length;
     
      if (nextSlot < count) {

        f_x1[nextSlot] = x1;
        f_colorTag[nextSlot]=currentColorTag;
        f_x2[nextSlot] = x2;
        f_y1[nextSlot] = y1;
        f_y2[nextSlot] = y2;
        this.moveType[nextSlot]=moveType;

        fontSize[nextSlot] = pdf_data.f_end_font_size[index];
        writingMode[nextSlot]=pdf_data.f_writingMode[index];
        textLength[nextSlot] = text_len;

        spaceWidth[nextSlot] = pdf_data.space_width[index];
        content[nextSlot] = current_text;

        nextSlot++;
      } else {
        count = count + increment;
        float[] t_x1 = new float[count];
        String[] t_colorTag=new String[count];
        float[] t_x2 = new float[count];
        float[] t_y1 = new float[count];
        float[] t_y2 = new float[count];
        float[] t_spaceWidth = new float[count];

        StringBuffer[] t_content = new StringBuffer[count];

        int[] t_font_size = new int[count];
        int[] t_text_len = new int[count];
        int[] t_writingMode=new int[count];
       
        int[] t_moveType=new int[count];
       
        boolean[] t_isUsed = new boolean[count];
       
        boolean[]t_hadSpace=new boolean[count];
       
        //copy in existing
        for (int i = 0; i < count - increment; i++) {
          t_x1[i] = f_x1[i];
          t_colorTag[i]=f_colorTag[i];
          t_x2[i] = f_x2[i];
          t_y1[i] = f_y1[i];
          t_y2[i] = f_y2[i];
          t_hadSpace[i]=hadSpace[i];
          t_spaceWidth[i] = spaceWidth[i];
          t_content[i] = content[i];
          t_font_size[i] = fontSize[i];
          t_writingMode[i]=writingMode[i];
          t_text_len[i] = textLength[i];
          t_isUsed[i] = isUsed[i];
          t_moveType[i]=this.moveType[i];
        }

        f_x1 = t_x1;
        f_colorTag=t_colorTag;
        hadSpace=t_hadSpace;
        f_x2 = t_x2;
        f_y1 = t_y1;
        f_y2 = t_y2;
        isUsed=t_isUsed;
       
        fontSize = t_font_size;
        writingMode=t_writingMode;
        textLength = t_text_len;

        spaceWidth = t_spaceWidth;

        content = t_content;
       
        this.moveType=t_moveType;

        f_x1[nextSlot] = x1;
        f_colorTag[nextSlot]=currentColorTag;
        f_x2[nextSlot] = x2;
        f_y1[nextSlot] = y1;
        f_y2[nextSlot] = y2;

        fontSize[nextSlot] = pdf_data.f_end_font_size[index];
        writingMode[nextSlot]=pdf_data.f_writingMode[index];
        t_text_len[nextSlot] = text_len;
        content[nextSlot] = current_text;

        spaceWidth[nextSlot] = pdf_data.space_width[index];
       
        this.moveType[nextSlot]=moveType;

        nextSlot++;

      } /***/

    }
  }

  //////////////////////////////////////////////////////////////////////
  /**
   * put rows together into one object with start and end
   */
  private void mergeTableRows(int border_width) {

    //merge row contents
    String separator ="</tr>\n<tr>";
   
    if (isXHTML == false)
      separator = "\n";

    master = ((Vector_Int) lines.elementAt(line_order[0])).elementAt(0);

    int item;
    for (int rr = 1; rr < max_rows; rr++) {

      item =((Vector_Int) lines.elementAt(line_order[rr])).elementAt(0);
      if(content[master]==null)
        master=item;
      else if(content[item]!=null)
      merge(master,item,separator,false);
    }

    //add start/end marker
    if (isXHTML) {
      if (border_width == 0){
        content[master].insert(0,"<TABLE>\n<tr>");
        content[master].append("</tr>\n</TABLE>\n");
      }else{
        StringBuffer startTag=new StringBuffer("<TABLE border='");
        startTag.append(String.valueOf(border_width));
        startTag.append( "'>\n<tr>");
        startTag.append(content[master]);
        content[master]=startTag;
        content[master].append("</tr>\n</TABLE>\n");
      }
    }

  }

  //////////////////////////////////////////////////
  /**
   * get list of unused fragments and put in list and sort in sorted_items
   */
  final private int[] getsortedUnusedFragments(
    boolean sortOnX,
    boolean use_y1) {
    int total_fragments = isUsed.length;

    //get unused item pointers
    int ii = 0;
    int sorted_temp_index[] = new int[total_fragments];
    for (int i = 0; i < total_fragments; i++) {
      if (isUsed[i] == false) {
        sorted_temp_index[ii] = i;
        ii++;
      }
    }
   
    int[] unsorted_items = new int[ii];
    int[] sorted_items;
    int[] sorted_temp_x1 = new int[ii];
    int[] sorted_temp_y1 = new int[ii];
    int[] sorted_temp_y2 = new int[ii];

    //put values in array and get x/y for sort
    for (int pointer = 0; pointer < ii; pointer++) {
      int i = sorted_temp_index[pointer];
      unsorted_items[pointer] = i;
     
      sorted_temp_x1[pointer] = (int) f_x1[i];

      //negative values to get sort in 'wrong' order from top of page
      sorted_temp_y1[pointer] = (int) f_y1[i];
      sorted_temp_y2[pointer] = (int) f_y2[i];

    }

    //sort
    if (sortOnX == false) {
      if (use_y1 == true)
        sorted_items =
          Sorts.quicksort(
            sorted_temp_y1,
            sorted_temp_x1,
            unsorted_items);
      else
        sorted_items =
          Sorts.quicksort(
            sorted_temp_y2,
            sorted_temp_x1,
            unsorted_items);
    } else
      sorted_items =
        Sorts.quicksort(sorted_temp_x1, sorted_temp_y1, unsorted_items);
   
    return sorted_items;
  }

  //////////////////////////////////////////////////////////////////////
  /**
   * create rows of data from preassembled indices, adding separators. Each
   * row is built to a temp array and then row created - we don't know how
   * many columns until the table is built
   * @throws PdfException
   */
  private void createTableRows(
    boolean keep_alignment_information,
    boolean keep_width_information,int currentWritingMode) throws PdfException {

    /**
     * create local copies of arrays
     */
    float[] f_x1,f_x2;
   
    /**
     * set pointers so left to right text
     */
    if(currentWritingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
      f_x1=this.f_x1;
      f_x2=this.f_x2;
      //f_y1=this.f_y1;
      //f_y2=this.f_y2;
    }else if(currentWritingMode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
      f_x2=this.f_x1;
      f_x1=this.f_x2;
      //f_y1=this.f_y1;
      //f_y2=this.f_y2;
    }else if(currentWritingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
      f_x1=this.f_y2;
      f_x2=this.f_y1;
      //f_y1=this.f_x2;
      //f_y2=this.f_x1;
    }else if(currentWritingMode==PdfData.VERTICAL_TOP_TO_BOTTOM){
      f_x1=this.f_y1;
      f_x2=this.f_y2;
      //f_y2=this.f_x1;
      //f_y1=this.f_x2;
     
      /**
       * fiddle x,y co-ords so it works
       */
     
      //get max size
      int maxX=0;
            for (float aF_x1 : f_x1) {
                if (maxX < aF_x1)
                    maxX = (int) aF_x1;
            }
     
      maxX++; //allow for fp error
      //turn around
      for(int ii=0;ii<f_x2.length;ii++){
        f_x1[ii]=maxX-f_x1[ii];
        f_x2[ii]=maxX-f_x2[ii];
      }
     
    }else{
      throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
    }

    int item, i, current_col = -1;

    int itemsInTable = 0, items_added = 0;
    //pointer to current element on each row
    int[] currentItem = new int[max_rows];

    Vector_Int[] rowContents = new Vector_Int[max_rows];
    Vector_String alignments = new Vector_String(); //text alignment
    Vector_Float widths = new Vector_Float(); //cell widths
    Vector_Float cell_x1 = new Vector_Float(); //cell widths
    String separator = "", empty_cell = "&nbsp;";

    if (isXHTML == false) {
      separator = "\",\"";
      empty_cell = "";
    }

    /**
     * set number of items on each line, column count and populate empty rows
     */
    int[] itemCount = new int[max_rows];
    for (i = 0; i < max_rows; i++) {
      itemCount[i] = ((Vector_Int) lines.elementAt(i)).size() - 1;

      //total number of items
      itemsInTable = itemsInTable + itemCount[i];

      //reset other values
      currentItem[i] = 0;
      rowContents[i] = new Vector_Int(20);
    }

    //now work through and split any overlapping items until all done
    while (true) {

      //size of column and pointers
      float x1 = 9999,min_x2 = 9999,x2,current_x1,current_x2,c_x1,next_x1 = 9999,c_x2,items_in_column = 0;
     
      current_col++;
      boolean all_done = true; //flag to exit at end
      float total_x1 = 0, total_x2 = 0, left_gap = 0, right_gap;

      String alignment = "center";

      if (items_added < itemsInTable) {

        /**
         * work out cell x boundaries on basis of objects
         */
        for (i = 0; i < max_rows; i++) { //get width for column
          if (itemCount[i] > currentItem[i]) { //item  id
           
            item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
            current_x1 = f_x1[item];
            current_x2 = f_x2[item];
           
            if (current_x1 < x1) //left margin
              x1 = current_x1;
            if (current_x2 < min_x2) //right margin if appropriate
              min_x2 = current_x2;
           
          }
        }
       
        cell_x1.addElement(x1); //save left margin
        x2 = min_x2; //set default right margin

        /**
         * workout end and next column start by scanning all items
         */
        for (i = 0;i < max_rows;i++) { //slot the next item on each row together work out item
          item = ((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
          c_x1 = f_x1[item];
          c_x2 = f_x2[item];

          //max item width of this column
          if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 > x2))
            x2 = c_x2;

          if (currentItem[i] < itemCount[i]) { //next left margin

            item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i] + 1);
            current_x1 = f_x1[item];
            if ((current_x1 > min_x2) & (current_x1 < next_x1))
              next_x1 = current_x1;
          }
        }

                //stop infinite loop case
                if(x1==x2)
                    break;

        //allow for last column
        if (next_x1 == 9999)
          next_x1 = x2;
     
        /**
         * count items in table and workout raw totals for alignment.
         * Also work out widest x2 in column
         */
        for (i = 0;i < max_rows;i++) { //slot the next item on each row together

          //work out item
          item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
          c_x1 = f_x1[item];
          c_x2 = f_x2[item];

          //use items in first column of single colspan
          if ((c_x1 >= x1) & (c_x1 < min_x2) & (c_x2 <= next_x1)) {

            //running totals to calculate alignment
            total_x1 = total_x1 + c_x1;
            total_x2 = total_x2 + c_x2;
            items_in_column++;

          }
        }
       
        /**
         * work out gap and include empty space between cols and save
         */
        if (i == 0)
          left_gap = x1;
        if (next_x1 == -1)
          right_gap = 0;
        else
          right_gap = (int) ((next_x1 - x2) / 2);

        int width = (int) (x2 - x1 + right_gap + left_gap);
                //noinspection UnusedAssignment,UnusedAssignment
                left_gap = right_gap;
        widths.addElement(width);

        /** workout the alignment */
        float x1_diff = (total_x1 / items_in_column) - x1;
        float x2_diff = x2 - (total_x2 / items_in_column);
        if (x1_diff < 1)
          alignment = "left";
        else if (x2_diff < 1)
          alignment = "right";
        alignments.addElement(alignment);

        for (i = 0;i < max_rows;i++) { //slot the next item on each row together
          master = ((Vector_Int) lines.elementAt(i)).elementAt(0);
          //get next item on line or -1 for no more
          if (itemCount[i] > currentItem[i]) {
            //work out item
            item =((Vector_Int) lines.elementAt(i)).elementAt(currentItem[i]);
            c_x1 = f_x1[item];
            c_x2 = f_x2[item];
            all_done = false;

          } else {
            item = -1;
            c_x1 = -1;
            c_x2 = -1;
          }

          if ((item == -1) & (items_added <= itemsInTable)) {
            //all items in table so just filling in gaps
            rowContents[i].addElement(-1);
           
          } else if ((c_x1 >= x1) & (c_x1 < x2)) {
            //fits into cell so add in and roll on marker

            rowContents[i].addElement(item);
            currentItem[i]++;
           
            items_added++;
          } else if (c_x1 > x2) { //empty cell
            rowContents[i].addElement(-1);
          }
        }
      }
      if (all_done)
        break;
    }

    //===================================================================
    /**
     * now assemble rows
     */
    for (int row = 0; row < max_rows; row++) {
      StringBuffer line_content = new StringBuffer();
     
      int count = rowContents[row].size() - 1;
      master = ((Vector_Int) lines.elementAt(row)).elementAt(0);

      for (i = 0; i < count; i++) {
        item = rowContents[row].elementAt(i);

        if (isXHTML) {

          //get width
          float current_width = widths.elementAt(i);
          String current_alignment = alignments.elementAt(i);
          int test, colspan = 1, pointer = i + 1;

          if (item != -1) {

            //look for colspan
            while (true) {
              test = rowContents[row].elementAt(i + 1);
              if ((test != -1) | (count == i + 1))
                break;

              //break if over another col - roll up single value on line
              if ((itemCount[row] > 1)& (cell_x1.elementAt(i + 1) > f_x2[item]))
                break;

              count--;
              rowContents[row].removeElementAt(i + 1);
              colspan++;

              //update width
              current_width =current_width + widths.elementAt(pointer);
              pointer++;
            }
          }
          line_content.append("<td");

          if (keep_alignment_information) {
            line_content.append(" align='");
            line_content.append(current_alignment);
            line_content.append('\'');
            if (colspan > 1)
                            line_content.append(" colspan='").append(colspan).append('\'');
          }

          if (keep_width_information)
                        line_content.append(" width='").append((int) current_width).append('\'');

          line_content.append(" nowrap>");
          if (item == -1)
            line_content.append(empty_cell);
          else
            line_content.append(content[item]);
          line_content.append("</td>");

        } else { //csv
          if (item == -1) //empty col
            line_content.append("\"\",");
          else{ //value
            line_content.append('\"');
            line_content.append(content[item]);
            line_content.append("\",");
          }
        }

        //merge to update other values
        if ((item != -1) && (master != item)) //merge tracks the shape
          merge(master,item,separator,false);

      }
      //substitute our 'hand coded' value
      content[master] = line_content;

    }
  }

  /**
   * work through data and create a set of rows and return an object with
   * refs for each line
   * @throws PdfException
   */
  private void createLinesInTable(int itemCount, int[] items,boolean addSpaceXMLTag,int mode) throws PdfException {

        /**
     * reverse order if text right to left
     */
    if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
      items=reverse(items);
   
    /**
     * create and populate local copies of arrays
     */
    float[] f_x1,f_x2,f_y1,f_y2;

        // set pointers so always left to right text
        switch(mode){
            case PdfData.HORIZONTAL_LEFT_TO_RIGHT:
      f_x1=this.f_x1;
      f_x2=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
                break;

            case PdfData.HORIZONTAL_RIGHT_TO_LEFT:
      f_x2=this.f_x1;
      f_x1=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
                break;

            case PdfData.VERTICAL_BOTTOM_TO_TOP:
      f_x1=this.f_y1;
      f_x2=this.f_y2;
      f_y1=this.f_x2;
      f_y2=this.f_x1;
                break;

            case PdfData.VERTICAL_TOP_TO_BOTTOM:
      f_x1=this.f_y2;
      f_x2=this.f_y1;
      f_y2=this.f_x1;
      f_y1=this.f_x2;
      items = this.getsortedUnusedFragments(false, true);
      items=reverse(items);
                break;

            default:
      throw new PdfException("Illegal value "+mode+"for currentWritingMode");
    }
   
    //holds line we're working on
    Vector_Int current_line;
   
        for (int j = 0; j < itemCount; j++) { //for all items

            int c=items[j],id = -1, i,last = c;
        float smallest_gap = -1, gap, yMidPt;

        if(!isUsed[c] && this.writingMode[c]==mode) {

          //reset pointer and add this element
          current_line = new Vector_Int(20);
          current_line.addElement(c);
          lineY2.addElement((int) f_y2[c]);

                //look for items along same line (already sorted into order left to right)
                while (true) {   //look for a match
                    for (int ii = 0; ii < itemCount; ii++) {

              i = items[ii];

                        if (!isUsed[i] && i!=c && writingMode[c]==mode && ((f_x1[i] > f_x1[c] && mode!=PdfData.VERTICAL_TOP_TO_BOTTOM)||(f_x1[i] < f_x1[c] && mode==PdfData.VERTICAL_TOP_TO_BOTTOM))) { //see if on right

                            gap = (f_x1[i] - f_x2[c]);

                if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
                  gap=-gap;

                            //allow for fp error
                            if (gap < 0 && gap > -2)
                                gap = 0;

                //make sure on right
                yMidPt = (f_y1[i] + f_y2[i]) / 2;

                //see if line & if only or better fit
                            if (yMidPt < f_y1[c] && yMidPt > f_y2[c] && (smallest_gap < 0 || gap < smallest_gap)) {
                  smallest_gap = gap;
                  id = i;
                }
              }
            }

            if (id == -1) //exit when no more matches
              break;

                    //merge in best match if fit found with last or if overlaps by less than half a space,otherwise join
                    float t = f_x1[id] - f_x2[last],possSpace=f_x1[id]-f_x2[c];
                    float av_char1 =(float)1.5 *((f_x2[id] - f_x1[id])/ textLength[id]);
                    float av_char2 =(float)1.5 *((f_x2[last] - f_x1[last]) / textLength[last]);

                    if((mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)){
                        possSpace=-possSpace;
                        t=-t;
                        av_char1=-av_char1;
                        av_char2=-av_char2;
                    }

                    if (t < av_char1 && t < av_char2) {
                        merge(last,id, isGapASpace(id, last, possSpace,addSpaceXMLTag,mode),true);
                    } else {
                        current_line.addElement(id);
                        last = id;
                    }

                    //flag used and reset variables used
                    isUsed[id] = true;
                    id = -1;
                    smallest_gap = 1000000;

                }

                //add line to list
                lines.addElement(current_line);
                max_rows++;
            }
        }
  }

  /**
   *
   * calls various low level merging routines on merge -
   *
   * isCSV sets if output is XHTML or CSV format -
   *
   * XHTML also has options to include font tags (keepFontInfo),
   * preserve widths (keepWidthInfo), try to preserve alignment
   * (keepAlignmentInfo), and set a table border width (borderWidth)
   *  - AddCustomTags should always be set to false
   *
   * @param x1 is the x coord of the top left corner
   * @param y1 is the y coord of the top left corner
   * @param x2 is the x coord of the bottom right corner
   * @param y2 is the y coord of the bottom right corner
   * @param pageNumber is the page you wish to extract from
   * @param isCSV is a boolean. If false the output is xhtml if true the text is out as CSV
   * @param keepFontInfo if true and isCSV is false keeps font information in extrated text.
   * @param keepWidthInfo if true and isCSV is false keeps width information in extrated text.
   * @param keepAlignmentInfo if true and isCSV is false keeps alignment information in extrated text.
   * @param borderWidth is the width of the border for xhtml
   * @return Map containing text found in estimated table cells
   * @throws PdfException If the co-ordinates are not valid
   */
  public final Map extractTextAsTable(
    int x1,
    int y1,
    int x2,
    int y2,
    int pageNumber,
    boolean isCSV,
    boolean keepFontInfo,
    boolean keepWidthInfo,
    boolean keepAlignmentInfo,
    int borderWidth)
    throws PdfException {

    //check in correct order and throw exception if not
    int[] v = validateCoordinates(x1, y1, x2, y2);
    x1 = v[0];
    y1 = v[1];
    x2 = v[2];
    y2 = v[3];
   
    /** return the content as an Element */
    Map table_content = new Hashtable();

    LogWriter.writeLog("extracting Text As Table");

    //flag type of table so we can add correct separators
    if (isCSV == true) {
      isXHTML = false;
    } else {
      isXHTML = true;
    }

    //init table variables
    lines = new Vector_Object(20);
    lineY2 = new Vector_Int(20);
    max_rows = 0;

    //init store for data
    copyToArrays(x1, y2, x2, y1, keepFontInfo, false,true,null,false);

    //initial grouping and delete any hidden text
    removeEncoding();

    //eliminate shadows and also merge overlapping text
    cleanupShadowsAndDrownedObjects(false);

    int[] items = this.getsortedUnusedFragments(true, false);
    int item_count = items.length; //number of items

    if(item_count==0)
      return table_content;
   
    /**
     * check orientation and get preferred. Items not correct will
     * be ignored
     */
    int writingMode=getWritingMode(items,item_count);

    String message ="Table Merging algorithm being applied " + (item_count) + " items";
    LogWriter.writeLog(message);
   
    /**
     * scan all items joining best fit to right of each fragment to build
     * lines
     */
    if (item_count > 1) {

      //workout the raw lines
      createLinesInTable(item_count, items,isXHTML,writingMode);

      /**
       * generate lookup with lines in correct order (minus used to get
       * correct order down the page)
       */
      int dx=1;
      if(writingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT || writingMode==PdfData.VERTICAL_TOP_TO_BOTTOM)
        dx=-1;
     
      line_order = new int[max_rows];
      int[] line_y=new int[max_rows];

      for (int i = 0; i < max_rows; i++) {
        line_y[i] = dx*lineY2.elementAt(i);
        line_order[i] = i;
      }

      line_order = Sorts.quicksort(line_y, line_order);

      //assemble the rows and columns
      createTableRows(keepAlignmentInfo, keepWidthInfo,writingMode);

      //assemble the rows and columns
      mergeTableRows(borderWidth);
     
    }

    content[master]=cleanup(content[master]);
   
    String processed_value = content[master].toString();

    if(processed_value!=null){
     
//      cleanup data if needed by removing duplicate font tokens
      if (!isCSV)
        processed_value = Fonts.cleanupTokens(processed_value);

      table_content.put("content", processed_value);
      table_content.put("x1", String.valueOf(x1));
      table_content.put("x2", String.valueOf(x2));
      table_content.put("y1", String.valueOf(y1));
      table_content.put("y2", String.valueOf(y2));
    }
   
    return table_content;
  }

  /** make sure co-ords valid and throw exception if not */
  private static int[] validateCoordinates(int x1, int y1, int x2, int y2)
    throws PdfException {
    if ((x1 > x2) | (y1 < y2)) {

//      String errorMessage = "Invalid parameters for text rectangle. ";
      if (x1 > x2){
//        errorMessage =
//          errorMessage
//            + "x1 value ("
//            + x1
//            + ") must be LESS than x2 ("
//            + x2
//            + "). ";
        int temp = x1;
        x1 = x2;
        x2 = temp;
        LogWriter.writeLog("x1 > x2, coordinates were swapped to validate");
      }
     
      if (y1 < y2){
//        errorMessage =
//          errorMessage
//            + "y1 value ("
//            + y1
//            + ") must be MORE than y2 ("
//            + y2
//            + "). ";
        int temp = y1;
        y1 = y2;
        y2 = temp;
        LogWriter.writeLog("y1 < y2, coordinates were swapped to validate");
      }
//      throw new PdfException(errorMessage);
    }
    return new int[]{x1,y1,x2,y2};
  }

  /**
   *
   * algorithm to place data from within coordinates to a vector of word, word coords (x1,y1,x2,y2)
   *
   * @param x1 is the x coord of the top left corner
   * @param y1 is the y coord of the top left corner
   * @param x2 is the x coord of the bottom right corner
   * @param y2 is the y coord of the bottom right corner
   * @param page_number is the page you wish to extract from
   * @param breakFragments will divide up text based on white space characters
   * @param punctuation is a string containing all values that should be used to divide up words
   * @return Vector containing words found and words coordinates (word, x1,y1,x2,y2...)
   * @throws PdfException If the co-ordinates are not valid
   */
  final public Vector extractTextAsWordlist(
    int x1,
    int y1,
    int x2,
    int y2,
    int page_number,
    boolean breakFragments,
    String punctuation)
    throws PdfException {

    /** make sure co-ords valid and throw exception if not */
    int[] v = validateCoordinates(x1, y1, x2, y2);
    x1 = v[0];
    y1 = v[1];
    x2 = v[2];
    y2 = v[3];

    /** extract the raw fragments (Note order or parameters passed) */
    if (breakFragments)
      copyToArrays(x1, y2, x2, y1, true, true,false,punctuation,true);
    else
      copyToArrays();

   
   
    /** delete any hidden text */
    removeEncoding();

    //eliminate shadows and also merge overlapping text
    cleanupShadowsAndDrownedObjects(true);

    int[] items = getsortedUnusedFragments(true, false);
    int count = items.length;

    /**if no values return null
     */
    if(count==0){
      LogWriter.writeLog("Less than 1 text item on page");
     
      return null;
    }
   
    /**
     * check orientation and get preferred. Items not correct will
     * be ignored
     */
    int writingMode=getWritingMode(items,count);

      /**
       * build set of lines from text
       */
      createLines(count, items,writingMode,true,false,false);

      /**
       * alter co-ords to rotated if requested
       */
      float[] f_x1=null,f_x2=null,f_y1=null,f_y2=null;

      if(useUnrotatedCoords || writingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
        f_x1=this.f_x1;
        f_x2=this.f_x2;
        f_y1=this.f_y1;
        f_y2=this.f_y2;
      }else if(writingMode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
        f_x2=this.f_x1;
        f_x1=this.f_x2;
        f_y1=this.f_y1;
        f_y2=this.f_y2;
      }else if(writingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
        f_x1=this.f_y2;
        f_x2=this.f_y1;
        f_y1=this.f_x2;
        f_y2=this.f_x1;

      }else if(writingMode==PdfData.VERTICAL_TOP_TO_BOTTOM){
        f_x1=this.f_y1;
        f_x2=this.f_y2;
        f_y2=this.f_x1;
        f_y1=this.f_x2;
      }

    /** put into a Vector */
    Vector values = new Vector();
     
      for (int i = 0; i < content.length; i++) {
        if (content[i] != null) {

//          System.out.println(">>>>>"+content[i]);

          if((colorExtracted)&&(isXMLExtraction)){
            if(!content[i].toString().toLowerCase().startsWith(GenericColorSpace.cb)){
              content[i].insert(0,f_colorTag[master]);
            }
            if(!content[i].toString().toLowerCase().endsWith(GenericColorSpace.ce)){
              content[i].append(GenericColorSpace.ce);
            }
          }

          if(isXMLExtraction)
            values.add((content[i]).toString());
          else
            values.add(Strip.convertToText((content[i]).toString(), isXMLExtraction));

          if((!useUnrotatedCoords)&&(writingMode==PdfData.VERTICAL_TOP_TO_BOTTOM)){
            values.add(String.valueOf(f_x1[i]));
            values.add(String.valueOf(f_y1[i]));
            values.add(String.valueOf(f_x2[i]));
            values.add(String.valueOf(f_y2[i]));
          }else if((!useUnrotatedCoords)&&(writingMode==PdfData.VERTICAL_BOTTOM_TO_TOP)){
            values.add(String.valueOf(f_x1[i]));
            values.add(String.valueOf(f_y2[i]));
            values.add(String.valueOf(f_x2[i]));
            values.add(String.valueOf(f_y1[i]));
          }else
            values.add(String.valueOf(f_x1[i]));
            values.add(String.valueOf(f_y1[i]));
            values.add(String.valueOf(f_x2[i]));
            values.add(String.valueOf(f_y2[i]));
          }
        }
      }

    LogWriter.writeLog("Text extraction as wordlist completed");
   
    return values;
   
  }

    /**
     * reset global values
     */
    private void reset(){

        isXHTML = true;
        nextSlot=0;

      lineBreaks = new Vector_Int();

        max_rows = 0;
        master = 0;

        colorExtracted=false;

    }

    /**
   * algorithm to place data from specified coordinates on a page into a String.
   *
   * @param x1 is the x coord of the top left corner
   * @param y1 is the y coord of the top left corner
   * @param x2 is the x coord of the bottom right corner
   * @param y2 is the y coord of the bottom right corner
   * @param page_number is the page you wish to extract from
   * @param estimateParagraphs will attempt to find paragraphs and add new lines in output if true
   * @param breakFragments will divide up text based on white space characters if true
   * @return Vector containing words found and words coordinates (word, x1,y1,x2,y2...)
   * @throws PdfException If the co-ordinates are not valid
   */
  final public String extractTextInRectangle(
    int x1,
    int y1,
    int x2,
    int y2,
    int page_number,
    boolean estimateParagraphs,
    boolean breakFragments)
    throws PdfException {


        reset();

        if((breakFragments)&&(!pdf_data.IsEmbedded()))
              throw new PdfException("[PDF] Request to breakfragments and width not added. Please add call to init(true) of PdfDecoder to your code.");
 
    /** make sure co-ords valid and throw exception if not */
    int[] v = validateCoordinates(x1, y1, x2, y2);
    x1 = v[0];
    y1 = v[1];
    x2 = v[2];
    y2 = v[3];
 
    int master, count;
 
    /** extract the raw fragments (Note order or parameters passed) */
    if (breakFragments)
      copyToArrays(x1, y2, x2, y1, (isXMLExtraction), false,false,null,false);
    else
      copyToArrays();
   
    /**
     * delete any hidden text
     */
    removeEncoding();
   
    /**
    * eliminate shadows and also merge overlapping text
    */
    cleanupShadowsAndDrownedObjects(false);
   
    /** get the fragments as an array */
    int[] items = getsortedUnusedFragments(true, false);
    count = items.length;

    /**if no values return null
     */
    if(count==0){
      LogWriter.writeLog("Less than 1 text item on page");
     
      return null;
    }
   
    /**
     * check orientation and get preferred. Items not correct will
     * be ignored
     */
    int writingMode=getWritingMode(items,count);
     
      /**
       * build set of lines from text
       */
      createLines(count, items,writingMode,false,isXMLExtraction,false);

        /**
               * roll lines together
               */
     
      master = mergeLinesTogether(writingMode,estimateParagraphs,x1,x2,y1,y2);

      /**
       * add final deliminators
       */
      if(isXMLExtraction){
        content[master] =new StringBuffer(Fonts.cleanupTokens(content[master].toString()));
        content[master].insert(0,"<p>");
        content[master].append("</p>");
      }
     
    LogWriter.writeLog("Text extraction completed");

    return cleanup(content[master]).toString();

  }
 
 
  private StringBuffer cleanup(StringBuffer buffer) {
   
    if(buffer==null)
      return buffer;

         /**
        if(PdfDecoder.inDemo){
            int icount=buffer.length(),count=0;
            boolean inToken=false;
            for(int i=0;i<icount;i++){
                char c=buffer.charAt(i);
                if(c=='<')
                    inToken=true;
                else if(c=='>')
                    inToken=false;
                else if((c!=' ')&&(!inToken)){
                    count++;
                    if(count>4){
                        count=0;
                        buffer.setCharAt(i,'1');
                    }
                }
            }
    }
    /**/

        //sort out & to &amp;
        if(isXMLExtraction){
            String buf=buffer.toString();

            buf=buf.replaceAll("&#","XX#");
            buf=buf.replaceAll("&lt","XXlt");
            buf=buf.replaceAll("&gt","XXgt");

            buf=buf.replaceAll("&","&amp;");

            //put back others
            buf=buf.replaceAll("XX#", "&#");
            buf=buf.replaceAll("XXlt", "&lt");
            buf=buf.replaceAll("XXgt","&gt");

            boolean removeInvalidXMLValues = true;
            if (removeInvalidXMLValues) {
           
              /**
         * Restricted Char ::=
         *  [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]
         *  [#x1-#x8] | [#x11-#x12] | [#x14-#x31] | [#x127-#x132] | [#x134-#x159]
         */
     
        /** set mappings */
        Map asciiMappings = new HashMap();
        /** [#x1-#x8] */
        for (int i = 1; i <= 8; i++)
          asciiMappings.put("&#" + i + ';', "");
       
        /** [#x11-#x12] */
        for (int i = 11; i <= 12; i++)
          asciiMappings.put("&#" + i + ';', "");
       
        /** [#x14-#x31] */
        for (int i = 14; i <= 31; i++)
          asciiMappings.put("&#" + i + ';', "");
       
        /** [#x127-#x132] */
        //for (int i = 127; i <= 132; i++)
          //asciiMappings.put("&#" + i + ";", "");
       
        /** [#x134-#x159] */
        //for (int i = 134; i <= 159; i++)
          //asciiMappings.put("&#" + i + ";", "");
       
       
        /** substitute illegal XML characters for mapped values */
                for (Object o : asciiMappings.keySet()) {
                    String character = (String) o;
                    String mappedCharacter = (String) asciiMappings.get(character);

                    buf = buf.replace(character, mappedCharacter);
                }
      }
      buffer=new StringBuffer(buf);
        }
       
        return buffer;
  }

  /**
   * scan fragments and detect orientation. If multiple,
   * prefer horizontal
   */
    private int getWritingMode(int[] items, int count) {

        /**
         * get first value
         */
        int orientation=writingMode[items[0]];

        //exit if first is horizontal
        if(orientation==PdfData.HORIZONTAL_LEFT_TO_RIGHT || orientation==PdfData.HORIZONTAL_RIGHT_TO_LEFT)
            return orientation;

        /**
         * scan items looking at orientation - exit if we find horizontal
         */
        for (int j = 1; j < count; j++) {

            int c=items[j];

            if (!isUsed[c]) {

                if(writingMode[c]==PdfData.HORIZONTAL_LEFT_TO_RIGHT || writingMode[c]==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
                    orientation=writingMode[c];
                    j=count;
                    LogWriter.writeLog("Text of multiple orientations found. Only horizontal text used.");
                }
            }
        }

        return orientation;
    }

  /**
   * Rolls all the remaining (unused) fragments into a single fragment,
   * choosing the separator placed between each pair as it goes.
   *
   * The last entry in the sorted list becomes the master and every other
   * entry is merged into it in turn, working backwards through the list.
   * Entries for which getLastChar finds no character are skipped.
   *
   * @param currentWritingMode one of the PdfData writing mode values - selects
   * how the stored co-ords are mapped and how the fragments are sorted
   * @param estimateParagraphs if true a paragraph break is added where the
   * master ends with . or " and the next fragment starts with A-Z
   * @param x1 left of rectangle (with x2 gives the middle for horizontal text)
   * @param x2 right of rectangle
   * @param y1 top of rectangle (with y2 gives the middle for vertical text)
   * @param y2 bottom of rectangle
   * @return index into content of the fragment everything was merged into
   * @throws PdfException if currentWritingMode is not a recognised value
   */
  private int mergeLinesTogether(int currentWritingMode,boolean estimateParagraphs, int x1,int x2,int y1,int y2) throws PdfException {

        String separator;

    int[] indices;

    //used for working out alignment
    int middlePage;

    /**
     * local aliases onto the co-ord arrays, mapped to suit the writing mode
     * so the code below can work in one orientation (no data is copied)
     */
    float[] f_x1,f_x2,f_y1,f_y2;

    if(currentWritingMode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
      f_x1=this.f_x1;
      f_x2=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
      indices = getsortedUnusedFragments(false, true);
      middlePage = (x1 + x2) / 2;
    }else if(currentWritingMode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
      f_x2=this.f_x1;
      f_x1=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
      indices = getsortedUnusedFragments(false, true);
      middlePage = (x1 + x2) / 2;
    }else if(currentWritingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
      f_x1=this.f_y1;
      f_x2=this.f_y2;
      f_y1=this.f_x2;
      f_y2=this.f_x1;
      indices = getsortedUnusedFragments(true, true);

      //only mode which needs the sorted list flipped
      indices=reverse(indices);
      middlePage = (y1 + y2) / 2;

    }else if(currentWritingMode==PdfData.VERTICAL_TOP_TO_BOTTOM){
      f_x1=this.f_y2;
      f_x2=this.f_y1;
      f_y2=this.f_x2;
      f_y1=this.f_x1;
      indices = getsortedUnusedFragments(true, true);
      middlePage = (y1 + y2) / 2;
    }else{
      throw new PdfException("Illegal value "+currentWritingMode+"for currentWritingMode");
    }
    int quarter = middlePage / 2;
    int count = indices.length;

    //last item is the one all others get merged into
    //NOTE(review): assumes at least one unused fragment - an empty list would fail on this line
    int master = indices[count - 1];

    /**
     * now loop through all lines merging
     */
    //these 2 values only set inside the debug block below
    StringBuffer child_textX=null;
    String master_textX=null;
    int ClastChar,MlastChar,CFirstChar;
    final boolean debug=false;
    for (int i = count - 2; i > -1; i--) {

      int child = indices[i];
      separator = "";

        /** add formatting in to retain structure */
        //text to see if lasts ends with . and next starts with capital

        //-1 if no chars
        ClastChar=getLastChar(content[child]);
        if(debug){

          CFirstChar=getFirstChar(content[child]);
          MlastChar=getLastChar(content[master]);

          child_textX = Strip.stripXML(content[child],isXMLExtraction);
          master_textX =Strip.stripXML(content[master],isXMLExtraction).toString();

        }

        //fragments with no last char are not merged
        if (ClastChar!=-1) {

          addAlignmentFormatting(estimateParagraphs, middlePage, f_x1, f_x2, quarter, child);

          //see if we insert a line break and merge
          //NOTE(review): tagged version is used when NOT extracting XML - looks reversed, confirm intended
          String lineSpace = "</p>"+SystemSeparator+"<p>";
          if(isXMLExtraction)
            lineSpace=SystemSeparator;

          //space between master and this fragment and height of this fragment
          float gap = f_y2[master] - f_y1[child];
          float line_height = f_y1[child] - f_y2[child];
          if(currentWritingMode==PdfData.VERTICAL_BOTTOM_TO_TOP){
            gap = -gap;
            line_height = -line_height;
          }

          if ((gap > line_height)&(line_height>0)) { //add in line gaps

            //one lineSpace for each whole line height which fits in gap
            while (gap > line_height) {
              separator = separator + lineSpace;
              gap = gap - line_height;
            }

            //XML gets a paragraph break added. Non-XML replaces whatever was
            //built above with a single separator
            if(isXMLExtraction)
              separator = separator + "</p>"+SystemSeparator+"<p>";
            else
              separator=SystemSeparator;

          } else if (estimateParagraphs == true) {

            CFirstChar=getFirstChar(content[child]);
            MlastChar=getLastChar(content[master]);

            //master ends with . or " and next starts with capital so treat as new paragraph
            if ((((MlastChar=='.'))|| (((MlastChar=='\"'))))&&((CFirstChar>='A')&& (CFirstChar<='Z'))){
              if(isXMLExtraction)
                separator = "</p>"+SystemSeparator+"<p>";
              else
                separator=SystemSeparator;
            }

          }else{
            //no gap and not estimating so break put directly onto the text
            //(separator itself stays empty)
            if(isXMLExtraction){
              content[child].insert(0, "</p>"+SystemSeparator+"<p>");
            }else
              content[master].append(SystemSeparator);
          }

          merge(master,child,separator,false);

      }
  }
    return master;
  }

  /**
   * Scans forward for the first character which is not a space and not part
   * of a &lt;...&gt; tag (or, for XML extraction, an &amp;...; entity).
   * A lone &amp; is returned as itself, and an entity ending lt; or gt; gives
   * &lt; or &gt;.
   *
   * @param buffer text to scan
   * @return char as int or -1 if no match
   */
  private int getFirstChar(StringBuffer buffer) {

    int i=-1;
    boolean inTag=false;
    int count=buffer.length();
    char openChar=' '; //char which opened current tag/entity (< or &)
    int ptr=0;

    //setting ptr=count is used to end scan - note the remaining tests in that pass still run
    while(ptr<count){
      char nextChar=buffer.charAt(ptr);

      //start of tag or (XML only) possible entity
      if((!inTag)&&((nextChar=='<')||(isXMLExtraction && nextChar=='&'))){
        inTag=true;
        openChar=nextChar;

        //trap & .... &xx; or other spurious
        if((openChar=='&')){
          if((ptr+1)==count){
            //& is last char so must be a real one
            i='&';
            ptr=count;
          }else{
            char c=buffer.charAt(ptr+1);

            //not start of &#.. , &gt or &lt so treat as real &
            if((c!='#')&&(c!='g')&&(c!='l')){
              i='&';
              ptr=count;
            }
          }
        }
      }

      //first non-space char outside any tag is our match
      if((!inTag)&&(nextChar!=' ')){
        i=nextChar;
        ptr=count;
      }

      //allow for valid & in stream
      if((inTag)&&(openChar=='&')&&(nextChar==' ')){
        i=openChar;
        ptr=count;
      }else if((inTag)&&((nextChar=='>')||(isXMLExtraction && openChar=='&' && nextChar==';'))){

        //put back < or >
        if((nextChar==';')&&(openChar=='&')&&(ptr>2)&(buffer.charAt(ptr-1)=='t')){
          if((buffer.charAt(ptr-2)=='l')){
            i='<';
            ptr=count;
          }else if((buffer.charAt(ptr-2)=='g')){
            i='>';
            ptr=count;
          }
        }

        //end of tag/entity
        inTag=false;
      }

      ptr++;
    }

    return i;
  }

  /**return char as int or -1 if no match*/
  private int getLastChar(StringBuffer buffer) {
   
    int i=-1;
    boolean inTag=false;
    int count=buffer.length();
    int size=count;
    char openChar=' ';
    count--; //knock 1 off so points to last char
   
    while(count>-1){
      char nextChar=buffer.charAt(count);
     
      //trap &xx;;
      if(inTag && openChar==';' && nextChar==';'){
        i=';';
        count=-1;
      }
     
      if(!inTag &&(nextChar=='>'||(isXMLExtraction && nextChar==';'))){
        inTag=true;
               
                //check it is a token and not just > at end
                int lastTokenStart=buffer.lastIndexOf("</"); //find start of this tag if exists
                if(lastTokenStart==-1){ //no tag so ignore
                    inTag=false;
                   
                }else{ //see if real token by looking for invalid chars inside and reject if found
                   char charToTest;
                   for(int ptr=lastTokenStart;ptr<count;ptr++){
                       charToTest=buffer.charAt(ptr);
                       if(charToTest==' ' || charToTest=='>'){
                           inTag=false;
                           ptr=count;
                       }
                   }
                }
               
                if(inTag)               
            openChar=nextChar;
                else{
                    i=nextChar;
                    count=-1;
                }
      }
     
      if(!inTag && nextChar!=32){
        i=nextChar;
        count=-1;
      }
     
      if(nextChar=='<' ||(isXMLExtraction && openChar==';' && nextChar=='&')){
        inTag=false;
       
        //put back < or >
        if((nextChar=='&')&&(count+3<size)&(buffer.charAt(count+2)=='t')&&(buffer.charAt(count+3)==';')){
          if((buffer.charAt(count+1)=='l')){
            i='<';
            count=-1;
          }else if((buffer.charAt(count+1)=='g')){
            i='>';
            count=-1;
          }
        }
      }
     
      if(inTag && openChar==';' && nextChar==' '){
        count=-1;
        i=';';
      }
      count--;
    }
   
    return i;
  }

  /**
   * reverse order in matrix so back to front
   */
  private static int[] reverse(int[] indices) {
    int count =indices.length;
    int[] newIndex=new int[count];
    for(int i=0;i<count;i++){
      newIndex[i]=indices[count-i-1];
    }
    return newIndex;
  }

  /**
   * used to add LEFT,CENTER,RIGHT tags into XML when extracting text
   */
  private void addAlignmentFormatting(boolean estimateParagraphs, int middlePage, float[] f_x1, float[] f_x2, int quarter, int child) {
    //put in some alignment
    float left_gap = middlePage - f_x1[child];
    float right_gap = f_x2[child] - middlePage;
    if ((!estimateParagraphs)&&(isXMLExtraction)&&
        (left_gap > 0)&& (right_gap > 0)&& (f_x1[child] > quarter)&& (f_x1[child] < (middlePage + quarter))) {
     
      float ratio = left_gap / right_gap;
      if (ratio > 1)
        ratio = 1 / ratio;
     
      if (ratio > 0.95){  //add centring if seems centered around middle
        content[child] =new StringBuffer(Fonts.cleanupTokens(content[child].toString()));
        content[child].insert(0,"<center>");
        content[child].append("</center>\n");
      }else if ((right_gap < 10) & (left_gap > 30)){  //add right align
        content[child] =new StringBuffer(Fonts.cleanupTokens(content[child].toString()));
        content[child].insert(0,"<right>");
        content[child].append("</right>\n");
         
      }
    }
  }

  /**
   * convert fragments into lines of text
   */
  /**
   * convert fragments into lines of text
   */
  private void createLines(int count, int[] items,int mode,boolean breakOnSpace,boolean addMultiplespaceXMLTag,boolean sameLineOnly) throws PdfException{
   
    String separator;

    final boolean debug=false;

    /**
     * create local copies of arrays
     */
    float[] f_x1,f_x2,f_y1,f_y2;

    /**
     * reverse order if text right to left
     */
    if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
      items=reverse(items);

    /**
     * set pointers so left to right text
     */
    if(mode==PdfData.HORIZONTAL_LEFT_TO_RIGHT){
      f_x1=this.f_x1;
      f_x2=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
    }else if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT){
      f_x2=this.f_x1;
      f_x1=this.f_x2;
      f_y1=this.f_y1;
      f_y2=this.f_y2;
    }else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP){
      f_x1=this.f_y1;
      f_x2=this.f_y2;
      f_y1=this.f_x2;
      f_y2=this.f_x1;
    }else if(mode==PdfData.VERTICAL_TOP_TO_BOTTOM){
      f_x1=this.f_y2;
      f_x2=this.f_y1;
      f_y2=this.f_x1;
      f_y1=this.f_x2;
    }else{
      throw new PdfException("Illegal value "+mode+"for currentWritingMode");
    }

    /**
     * scan items joining best fit to right of each fragment to build
     * lines. This is tedious and processor intensive but necessary as the
     * order cannot be guaranteed
     */
    for (int j = 0; j < count; j++) {
     
      int id = -1, i;
      int c=items[j];
     
      float smallest_gap = -1, gap, yMidPt;
      if(!isUsed[c] && this.writingMode[c]==mode) {
       
        if(debug)
          System.out.println("Look for match with "+removeHiddenMarkers(content[c].toString()));

        while (true) {
          for (int j2 = 0; j2 < count; j2++) {
            i=items[j2];

            if(isUsed[i] == false){

              //amount of variation in bottom of text
              int baseLineDifference = (int) (f_y2[i] - f_y2[c]);
              if (baseLineDifference < 0)
                baseLineDifference = -baseLineDifference;
             
              //amount of variation in bottom of text
              int topLineDifference = (int) (f_y1[i] - f_y1[c]);
              if (topLineDifference < 0)
                topLineDifference = -topLineDifference;

              // line gap
              int lineGap = (int) (f_x1[i] - f_x2[c]);
             
              //Check if fragments are closer from the other end
              if(lineGap>(int) (f_x1[c] - f_x2[i]))
                lineGap = (int) (f_x1[c] - f_x2[i]);
             
              int fontSizeChange=fontSize[c]-fontSize[i];
              if(fontSizeChange<0)
                fontSizeChange=-fontSizeChange;

              if(debug)
                System.out.println("Against "+removeHiddenMarkers(content[i].toString()));

              if(sameLineOnly && lineGap>fontSize[c] && lineGap>0){ //ignore text in wrong order allowing slight margin for error
                // allow for multicolumns with gap

                if(debug)
                  System.out.println("case1 lineGap="+lineGap);
//              //Case removed as it broke one file and had no effect on other files
//              }else if (sameLineOnly && (lineGap > (fontSize[c]*10)|| lineGap > (fontSize[i]*10)) ) { //JUMP IN TEXT SIZE ACROSS COL
//                //ignore
//
//                if(debug)
//                  System.out.println("case2");
              }else if (sameLineOnly && baseLineDifference > 1 && lineGap > 2 * fontSize[c] && (fontSize[c] == fontSize[i])) { //TEXT SLIGHTLY OFFSET
                //ignore
                if(debug)
                  System.out.println("case3");
              }else if(sameLineOnly && baseLineDifference>3){
                //ignore
                if(debug)
                  System.out.println("case4");
              }else if(sameLineOnly && fontSizeChange>2){
                //ignore
                if(debug)
                  System.out.println("case5");
              }else if (i!=c &&((f_x1[i] > f_x1[c] && mode!=PdfData.VERTICAL_TOP_TO_BOTTOM)||
                  f_x1[i] < f_x1[c] && mode==PdfData.VERTICAL_TOP_TO_BOTTOM && writingMode[c]==mode
                  && (!(fontSizeChange>2) || (fontSizeChange>2 && topLineDifference<3))
                  )) { //see if on right

                gap = (f_x1[i] - f_x2[c]);

                if(debug)
                  System.out.println("case6 gap="+gap);

                if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
                  gap=-gap;

                //allow for fp error
                if ((gap < 0) && (gap > -2))
                  gap = 0;

                //make sure on right
                yMidPt = (f_y1[i] + f_y2[i]) / 2;

                //see if line & if only or better fit
                if ((yMidPt < f_y1[c])&& (yMidPt > f_y2[c])&&((smallest_gap < 0)|| (gap < smallest_gap))) {
                  smallest_gap = gap;
                  id = i;
               
              }
            }
          }

          //merge on next right item or exit when no more matches
          if (id == -1)
            break;

          float possSpace=f_x1[id]-f_x2[c];         
            if(mode==PdfData.HORIZONTAL_RIGHT_TO_LEFT || mode==PdfData.VERTICAL_TOP_TO_BOTTOM)
              possSpace=-possSpace;
                    else if(mode==PdfData.VERTICAL_BOTTOM_TO_TOP)
                        possSpace=(f_x2[id]-f_x1[c]);
                       
          //add space if gap between this and last object
          separator =isGapASpace(c,id,possSpace,addMultiplespaceXMLTag,mode);
         
          /** merge if adjoin */
          if ((breakOnSpace)&&(hadSpace!=null)&&((hadSpace[c])||(separator.startsWith(" "))))
            break;
         
          merge(c,id,separator,true);



          id = -1; //reset
          smallest_gap = 1000000; //and reset the gap

        }
      }
    }
  }

  static class ResultsComparator implements Comparator {
    private int rotation;
   
    public ResultsComparator(int rotation) {
      this.rotation = rotation;
    }
   
    public int compare(Object o1, Object o2) {
      Rectangle[] ra1;
      Rectangle[] ra2;

      if(o1 instanceof Rectangle[]){
        ra1 = (Rectangle[]) o1;
      }else
        ra1 = new Rectangle[]{(Rectangle) o1};

      if(o2 instanceof Rectangle[]){
        ra2 = (Rectangle[]) o2;
      }else     
        ra2 = new Rectangle[]{(Rectangle) o2};

      for(int i=0; i!=ra1.length; i++)
        for(int j=0; j!=ra2.length; j++){ //do we need this loop?
          Rectangle r1 = ra1[i];
          Rectangle r2 = ra2[j];

          switch(rotation){
          case   0:
            if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left
              if (r1.x > r2.x)
                return 1;
              else
                return -1;
            } else if (r1.y > r2.y) { // the first word is above the second, so pick the first
              return -1;
            }

            return 1;// the second word is above the first, so pick the second
           
          case  90:
            if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left
              if (r1.y > r2.y)
                return 1;
              else
                return -1;
            } else if (r1.x > r2.x) // the first word is above the second, so pick the first
              return 1;

            return -1; // the second word is above the first, so pick the second
           
          case 180:
            if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left
              if (r1.x > r2.x)
                return 1;
              else
                return -1;
            } else if (r1.y > r2.y) { // the first word is above the second, so pick the first
              return -1;
            }

            return 1;// the second word is above the first, so pick the second
           
          case 270:
            if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left
              if (r1.y > r2.y)
                return 1;
              else
                return -1;
            } else if (r1.x < r2.x) // the first word is above the second, so pick the first
              return 1;

            return -1; // the second word is above the first, so pick the second
          }
         
         
          //Orginal code kept incase of mistake.
//          if (rotation == 0 || rotation == 180) {
//            if (r1.y == r2.y) { // the two words on on the same level so pick the one on the left
//              if (r1.x > r2.x)
//                return 1;
//              else
//                return -1;
//            } else if (r1.y > r2.y) { // the first word is above the second, so pick the first
//              return -1;
//            }
//
//            return 1; // the second word is above the first, so pick the second
//          }
//          else { // rotation == 90 or 270
//            if (r1.x == r2.x) { // the two words on on the same level so pick the one on the left
//              if (r1.y > r2.y)
//                return 1;
//              else
//                return -1;
//            } else if (r1.x > r2.x) // the first word is above the second, so pick the first
//              return 1;
//
//            return -1; // the second word is above the first, so pick the second
//          }
        }
      return -1; // the second word is above the first, so pick the second
    }
  }
 
  //<link><a name="findMultipleTermsInRectangleWithMatchingTeasers" />
  /**
   * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on <b>page_number</b>, with matching teaser
   *
   * @param x1 the left x cord
   * @param y1 the upper y cord
   * @param x2 the right x cord
   * @param y2 the lower y cord
   * @param rotation the rotation of the page to be searched
   * @param page_number the page number to search on
   * @param terms the terms to search for
   * @param searchType the search type made up from one or more constants obtained from the SearchType class
   * @param listener an implementation of SearchListener is required, this is to enable searching to be cancelled
   * @return a SortedMap containing a collection of Rectangle describing the location of found text, mapped to a String
   * which is the matching teaser
   * @throws PdfException If the co-ordinates are not valid
   */
  public SortedMap findMultipleTermsInRectangleWithMatchingTeasers(int x1, int y1, int x2, int y2, final int rotation,
      int page_number, String[] terms, int searchType, SearchListener listener) throws PdfException {
   
    usingMultipleTerms = true;
    multipleTermTeasers.clear();
    teasers = null;
   
    boolean origIncludeTease = includeTease;
   
    includeTease = true;
   
    List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);

    SortedMap highlightsWithTeasers = new TreeMap(new ResultsComparator(rotation));
   
    for (int i = 0; i < highlights.size(); i++) {

      /*highlights.get(i) is a rectangle or a rectangle[]*/
      highlightsWithTeasers.put(highlights.get(i),  multipleTermTeasers.get(i));
    }

    usingMultipleTerms = false;
   
    includeTease = origIncludeTease;
   
    return highlightsWithTeasers;
  }
 
  //<link><a name="findMultipleTermsInRectangle" />
  /**
   * Algorithm to find multiple text terms in x1,y1,x2,y2 rectangle on <b>page_number</b>.
   *
   * @param x1 the left x cord
   * @param y1 the upper y cord
   * @param x2 the right x cord
   * @param y2 the lower y cord
   * @param rotation the rotation of the page to be searched
   * @param page_number the page number to search on
   * @param terms the terms to search for
   * @param orderResults if true the list that is returned is ordered to return the resulting rectangles in a
   * logical order descending down the page, if false, rectangles for multiple terms are grouped together.
   * @param searchType the search type made up from one or more constants obtained from the SearchType class
   * @param listener an implementation of SearchListener is required, this is to enable searching to be cancelled
   * @return a list of Rectangle describing the location of found text
   * @throws PdfException If the co-ordinates are not valid
   */
  public List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, final int rotation,
      int page_number, String[] terms, boolean orderResults, int searchType, SearchListener listener) throws PdfException {
   
    usingMultipleTerms = true;
    multipleTermTeasers.clear();
    teasers = null;
   
    List highlights = findMultipleTermsInRectangle(x1, y1, x2, y2, page_number, terms, searchType, listener);
   
    if (orderResults) {
      Collections.sort(highlights, new ResultsComparator(rotation));
    }
   
    usingMultipleTerms = false;
   
    return highlights;
  }

  private List findMultipleTermsInRectangle(int x1, int y1, int x2, int y2, int page_number, String[] terms, int searchType,
      SearchListener listener) throws PdfException {
   
        List list = new ArrayList();

        for (String term : terms) {
            if (listener != null && listener.isCanceled()) {
//        System.out.println("RETURNING EARLY");
                break;
            }

            float[] co_ords;

            co_ords = findText(new Rectangle(x1, y1, x2, y2), page_number, new String[]{term}, searchType);

            if (co_ords != null) {
                int count = co_ords.length;
                for (int ii = 0; ii < count; ii = ii + 5) {

                    int wx1 = (int) co_ords[ii];
                    int wy1 = (int) co_ords[ii + 1];
                    int wx2 = (int) co_ords[ii + 2];
                    int wy2 = (int) co_ords[ii + 3];

                    Rectangle rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2);

                    int seperator = (int) co_ords[ii + 4];

                    if (seperator == linkedSearchAreas) {
                        Vector_Rectangle vr = new Vector_Rectangle();
                        vr.addElement(rectangle);
                        while (seperator == linkedSearchAreas) {
                            ii = ii + 5;
                            wx1 = (int) co_ords[ii];
                            wy1 = (int) co_ords[ii + 1];
                            wx2 = (int) co_ords[ii + 2];
                            wy2 = (int) co_ords[ii + 3];
                            seperator = (int) co_ords[ii + 4];
                            rectangle = new Rectangle(wx1, wy2, wx2 - wx1, wy1 - wy2);
                            vr.addElement(rectangle);
                        }
                        vr.trim();
                        list.add(vr.get());
                    } else {
                        list.add(rectangle);
                    }
                }
            }
        }
    return list;
  }



  //<link><a name="findTextInRectangle" />
  /**
   * Method to find text in the specified area allowing for the text to be split across multiple lines.</br>
   * @param searchArea = Area on page to search. If null search whole page
   * @param page_number = the current page to search
   * @param terms = the text to search for
   * @param searchType = info on how to search the pdf
   * @return the coords of the found text in a float[] where the coords are pdf page coords.
   * The origin of the coords is the bottom left hand corner (on unrotated page) organised in the following order.</br>
   * [0]=result x1 coord</br>
   * [1]=result y1 coord</br>
   * [2]=result x2 coord</br>
   * [3]=result y2 coord</br>
   * [4]=either -101 to show that the next text area is the remainder of this word on another line else any other value is ignored.</br>
   * @throws PdfException
   */
  final public float[] findText(
      Rectangle searchArea,
      int page_number,
      String[] terms,
      int searchType)
  throws PdfException {

    //NOTE(review): searchArea and page_number are never read in this method body,
    //so the search always runs over all extracted text - confirm this is intended

    //No search terms supplied so there is nothing to do
    if (terms == null)
      return new float[]{};

    //Flags to control the different search options
    boolean firstOccuranceOnly = false;
    boolean wholeWordsOnly = false;
    boolean foundFirst = false;
    boolean useRegEx = false;

    //Search result and teaser holders (shared by every writing mode pass below)
    Vector_Float resultCoords = new Vector_Float(0);
    Vector_String resultTeasers = new Vector_String(0);

    //Extract the text data into local arrays for searching
    copyToArrays();

    //Remove any hidden text on page as should not be found
    cleanupShadowsAndDrownedObjects(false);

    //Get unused text objects and sort them for correct searching
    int[] items = getsortedUnusedFragments(true, false);

    /**
     * Count how many fragments use each writing mode so the modes can be
     * searched most common first. Modes with no fragments are skipped later.
     * The case values 0-3 presumably match the PdfData writing mode
     * constants tested further down - TODO confirm
     */
    int l2r = 0;
    int r2l = 0;
    int t2b = 0;
    int b2t = 0;

    for(int i=0; i!=items.length; i++){
      switch(writingMode[items[i]]){
      case 0 :l2r++; break;
      case 1 :r2l++; break;
      case 2 :t2b++; break;
      case 3 :b2t++; break;     
      }
    }

    //unsorted is indexed by writing mode, sorted holds the same counts in ascending order
    int[] unsorted = new int[]{l2r, r2l, t2b, b2t};
    int[] sorted = new int[]{l2r, r2l, t2b, b2t};

    //Set all to -1 so we can tell if it's been set yet
    int[] writingModes = new int[]{-1,-1,-1,-1};

    Arrays.sort(sorted);

    //Fill writingModes with the mode numbers ordered by descending count.
    //Index j in the ascending array maps to position 3-j, and a position
    //already taken (equal counts) is skipped in favour of the next match.
    for(int i=0; i!= unsorted.length; i++){
      for(int j=0; j < sorted.length; j++){
        if(unsorted[i]==sorted[j]){

          int pos = j - 3;
          if(pos<0)
            pos=-pos;

          if(writingModes[pos]==-1){
            writingModes[pos] = i;
            //force the inner loop to finish now this mode is placed
            j=sorted.length;
          }
        }
      }
    }

    //Run the whole search once for each writing mode, most common first
    for(int u=0; u!=writingModes.length; u++){

      //local value deliberately shadows the this.writingMode array
      int writingMode = writingModes[u];

      //if no lines for writing mode, ignore
      if(unsorted[writingMode]!=0){

        //Merge text fragments into lines as displayed on page
        createLines(items.length, items, writingMode, true, false, true);

        //Bitwise flags for regular expressions engine, options always required
        int options = 0;

        //Search is case insensitive unless the CASE_SENSITIVE flag is set
        if((searchType & SearchType.CASE_SENSITIVE) != SearchType.CASE_SENSITIVE){
          options =(options | Pattern.CASE_INSENSITIVE);
        }

        //Only find first occurrence of each search term
        if((searchType & SearchType.FIND_FIRST_OCCURANCE_ONLY) == SearchType.FIND_FIRST_OCCURANCE_ONLY){
          firstOccuranceOnly = true;
        }

        //Only find whole words, not partial words
        if((searchType & SearchType.WHOLE_WORDS_ONLY) == SearchType.WHOLE_WORDS_ONLY){
          wholeWordsOnly = true;
        }

        //Allow search to find split line results
        if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
          options =(options | Pattern.MULTILINE | Pattern.DOTALL);
        }

        //Allow the use of regular expressions symbols
        if((searchType & SearchType.USE_REGULAR_EXPRESSIONS) == SearchType.USE_REGULAR_EXPRESSIONS){
          useRegEx = true;
        }

        /**
         * create local references to the coordinate arrays (these shadow the fields)
         */
        float[] f_y1 = this.f_y1, f_y2 = this.f_y2;

        /**
         * swap around x and y so routine works on all cases:
         * for the two vertical modes the x arrays stand in for the y arrays
         */
        boolean valuesSwapped = false;
        if (writingMode == PdfData.HORIZONTAL_LEFT_TO_RIGHT) {
          f_y1 = this.f_y1;
          f_y2 = this.f_y2;
        } else if (writingMode == PdfData.HORIZONTAL_RIGHT_TO_LEFT) {
          f_y1 = this.f_y1;
          f_y2 = this.f_y2;
        } else if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
          f_y1 = this.f_x2;
          f_y2 = this.f_x1;
          valuesSwapped = true;
        } else if (writingMode == PdfData.VERTICAL_TOP_TO_BOTTOM) {
          f_y2 = this.f_x1;
          f_y1 = this.f_x2;
          valuesSwapped = true;
        }

        //Portions of text to perform the search on and find teasers
        String[] searchText;
        String[] coordsText;


        //Merge all text into one with \n line separators
        //This will allow checking for multi line split results
        //raw and plain start out identical and only diverge when stripped below
        String plain = "";
        String raw = "";
        for(int i=0; i!=content.length; i++){
          if(content[i]!=null && writingMode == this.writingMode[i]){

            raw += content[i] +"\n";
            plain += content[i] +"\n";
          }
        }

        //Replace double spaces with single spaces
        raw = removeDuplicateSpaces(raw);
        plain = removeDuplicateSpaces(plain);

        //Strip xml from content and keep coords and text data
        raw = Strip.stripXML(raw,isXMLExtraction).toString();

        //Strip xml and coords data from content and keep text data
        plain = removeHiddenMarkers(plain);
        plain = Strip.stripXML(plain,isXMLExtraction).toString();

        //Store text in the search and teaser arrays
        searchText = new String[]{plain};
        coordsText = new String[]{raw};

        //Hold starting point data at page rotation
        Point resultStart;

        //Work through the search terms one at a time
        for(int j=0; j!=terms.length; j++){

          String searchValue = terms[j];

          //Set the default separator between words in a search term
          String sep = " ";

          //Multiline needs space or newline to be recognised as word separators
          //(backslashes are doubled because sep is used as a replaceAll
          //replacement string, where a backslash must itself be escaped)
          if((searchType & SearchType.MUTLI_LINE_RESULTS) == SearchType.MUTLI_LINE_RESULTS){
            sep = "[ \\\\n]";
          }

          //if not using reg ex add reg ex literal flags around the text and word separators
          //so the separator itself sits outside the quoted sections
          if(!useRegEx){
            searchValue = "\\Q"+searchValue+"\\E";
            sep = "\\\\E"+sep+"\\\\Q";
          }

          //If word separator has changed, replace all spaces with modified separator
          if(!sep.equals(" ")){
            searchValue = searchValue.replaceAll(" ", sep);
          }

          //Surround search term with word boundary tags to match whole words
          if(wholeWordsOnly)
            searchValue = "\\b"+searchValue+"\\b";

          //Create pattern to match search term
          Pattern searchTerm = Pattern.compile(searchValue, options);

          //Create pattern to match search term with two words before and after
          Pattern teaserTerm = Pattern.compile("(?:\\S+\\s)?\\S*(?:\\S+\\s)?\\S*"+searchValue+"\\S*(?:\\s\\S+)?\\S*(?:\\s\\S+)?", options);

          //Loop through all search text
          for(int i=0; i!=searchText.length; i++){

            //Get text data and text+coord data
            String plainText = searchText[i];
            String coordText = coordsText[i];

            //So long as text data is not null
            if(plainText!=null){

              //Create two matchers for finding search term and teaser
              Matcher termFinder = searchTerm.matcher(plainText);
              Matcher teaserFinder = teaserTerm.matcher(plainText);
              boolean needToFindTeaser = true;

              //Keep looping till no result is returned
              while(termFinder.find()){
                resultStart = null;
                //Make note of the text found and index in the text
                //(termEnds is the index of the last matched character)
                String foundTerm = termFinder.group();
                int termStarts = termFinder.start();
                int termEnds = termFinder.end()-1;

                //If storing teasers
                if(includeTease){

                  //Store the term found as a default value
                  String teaser = foundTerm;

                  if(includeHTMLtags)
                    teaser = "<b>"+teaser+"</b>";

                  //Only advance the teaser matcher while it is keeping in step with the term matcher
                  boolean itemFound = false;
                  if(needToFindTeaser){
                    itemFound = teaserFinder.find();
                  }

                  if(itemFound){
                    //Use the teaser only if it starts before and ends after the term,
                    //setting the search term to bold if allowed
                    if(teaserFinder.start()<termStarts && teaserFinder.end()>termEnds){

                      //replace default with found teaser
                      teaser = teaserFinder.group();

                      if(includeHTMLtags){
                        //Calculate points to add bold tags (offsets within the teaser)
                        int teaseStarts = termStarts-teaserFinder.start();
                        int teaseEnds = (termEnds-teaserFinder.start())+1;

                        //Add bold tags
                        teaser = teaser.substring(0, teaseStarts) + "<b>" +
                        teaser.substring(teaseStarts, teaseEnds) + "</b>" +
                        teaser.substring(teaseEnds, teaser.length());
                      }
                      needToFindTeaser = true;
                    }else{
                      //NOTE(review): once cleared the flag is never set again in this
                      //loop, so all later matches fall back to the bare term - confirm
                      needToFindTeaser = false;
                    }
                  }

                  //Store teaser
                  resultTeasers.addElement(teaser);
                }

                //Get coords of found text for highlights
                float currentX = 0;
                float width = 0;

                //Track point in text data line (without coord data);
                //advanced by the length of each text piece read from coordText
                int pointInLine = -1;

                //Track line on page (index into content and the coordinate arrays)
                int lineCounter = 0;

                //Skip null values and value not in the correct writing mode to ensure correct result coords
                while(content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])
                  lineCounter++;

                //Flags used to catch if result is split across lines
                boolean startFound = false;
                boolean endFound = false;

                //Cycle through coord text looking for coords of this result.
                //Each entry is read as: marker, x coord, marker, width, marker, text
                //Ignore first value as it is known to be the first marker
                for(int pointer=1; pointer<coordText.length(); pointer++){

                  // find second marker and get x coord
                  int startPointer = pointer;
                  while (pointer < coordText.length()) {
                    if (coordText.charAt(pointer) == MARKER2)
                      break;
                    pointer++;
                  }

                  //Convert text to float value for x coord
                  currentX = Float.parseFloat(coordText.substring(startPointer, pointer));
                  pointer++;


                  // find third marker and get width
                  startPointer = pointer;
                  while (pointer < coordText.length()) {
                    if (coordText.charAt(pointer) == MARKER2)
                      break;

                    pointer++;
                  }

                  //Convert text to float value for character width
                  width = Float.parseFloat(coordText.substring(startPointer, pointer));
                  pointer++;


                  // find fourth marker and get text (character)
                  //the for loop's own increment then steps over that marker
                  startPointer = pointer;
                  while (pointer < coordText.length()) {
                    if (coordText.charAt(pointer) == MARKER2)
                      break;

                    pointer++;
                  }

                  //Store text to check for newline character later
                  String text = coordText.substring(startPointer, pointer);
                  pointInLine+=text.length();

                  //Start of term not found yet.
                  //Point in line is equal to or greater than start of the term.
                  //Store coords and mark start as found.
                  if(!startFound && pointInLine>=termStarts){
                    resultStart = new Point((int)currentX, (int)f_y1[lineCounter]);
                    startFound = true;
                  }

                  //End of term not found yet.
                  //Point in line is equal to or greater than end of the term.
                  //Store coords and mark end as found.
                  //Each result is 5 values; the last is 0 for a complete result.
                  if(!endFound && pointInLine>=termEnds){
                    if (valuesSwapped){
                      //NOTE(review): in (int) currentX+width the cast applies to currentX
                      //only, so width is added as a float - confirm this is intended
                      if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
                        resultCoords.addElement((int) f_y2[lineCounter]);
                        resultCoords.addElement((int) currentX+width);
                        resultCoords.addElement(resultStart.y);
                        resultCoords.addElement(resultStart.x);
                        resultCoords.addElement(0.0f);
                      } else {
                        resultCoords.addElement((int) f_y2[lineCounter]);
                        resultCoords.addElement(resultStart.x);
                        resultCoords.addElement(resultStart.y);
                        resultCoords.addElement((int) currentX+width);
                        resultCoords.addElement(0.0f);
                      }
                    }else{
                      resultCoords.addElement(resultStart.x);
                      resultCoords.addElement(resultStart.y);
                      resultCoords.addElement(currentX + width);
                      resultCoords.addElement(f_y2[lineCounter]);
                      resultCoords.addElement(0.0f);
                    }

                    endFound = true;
                  }

                  //Using multi line option.
                  //Start of term found.
                  //End of term not found.
                  //New line character found.
                  //Set up multi line result: close this area at the end of the line
                  //and flag it with linkedSearchAreas so the next area is joined to it.
                  if(startFound && !endFound && text.contains("\n")){
                    //Set ends coords
                    if (valuesSwapped){
                      if (writingMode == PdfData.VERTICAL_BOTTOM_TO_TOP) {
                        resultCoords.addElement((int) f_y2[lineCounter]);
                        resultCoords.addElement((int) currentX+width);
                        resultCoords.addElement(resultStart.y);
                        resultCoords.addElement(resultStart.x);
                        resultCoords.addElement(linkedSearchAreas); //Mark next result as linked

                      } else {
                        resultCoords.addElement((int) f_y2[lineCounter]);
                        resultCoords.addElement(resultStart.x);
                        resultCoords.addElement(resultStart.y);
                        resultCoords.addElement((int) currentX+width);
                        resultCoords.addElement(linkedSearchAreas); //Mark next result as linked

                      }
                    }else{
                      resultCoords.addElement(resultStart.x);
                      resultCoords.addElement(resultStart.y);
                      resultCoords.addElement(currentX + width);
                      resultCoords.addElement(f_y2[lineCounter]);
                      resultCoords.addElement(linkedSearchAreas); //Mark next result as linked
                    }
                    //Set start of term as not found
                    startFound = false;

                    //Set this point in line as start of next term
                    //Guarantees next character is found as
                    //start of the next part of the search term
                    termStarts = pointInLine;
                  }

                  //Progress the line number whenever we find a \n
                  //This is to allow the correct calculation of y coords
                  if(text.contains("\n")){
                    lineCounter++;

                    //If current content pointed at is null or not the correct writing mode, skip value until data is found
                    while(lineCounter<content.length && (content[lineCounter]==null || writingMode!=this.writingMode[lineCounter])){
                      lineCounter++;
                    }
                  }

                }

                //If only finding first occurrence,
                //Stop searching this text data for search term.
                if(firstOccuranceOnly){
                  foundFirst = true;
                  break;
                }
              }

              //If only finding first occurrence and first is found,
              //Stop searching all text data for this search term.
              if(firstOccuranceOnly && foundFirst){
                break;
              }
            }
          }

        }

        //Remove any trailing empty values
        resultCoords.trim();

        //If including tease values
        if(includeTease){

          //Remove any trailing empty values
          resultTeasers.trim();

          //Store teasers so they can be retrieved by different search methods
          //NOTE(review): this runs once per writing mode that has text and
          //resultTeasers is never cleared in between, so it looks like teasers
          //from earlier modes are added to multipleTermTeasers again - confirm
          if (usingMultipleTerms){
            //Store all teasers so they may be returned as a sorted map
            //Only used for one method controlled by the above flag
            for(int i=0; i!=resultTeasers.size(); i++)
              multipleTermTeasers.add(resultTeasers.elementAt(i));
          }else{
            //Store all teasers to be retrieved by getTeasers() method
            teasers = resultTeasers.get();
          }
        }
      }
    }
    //Return coord data for search results (from all writing modes searched)
    return resultCoords.get();
    
  }

  private static String removeDuplicateSpaces(String textValue) {
   
    if(textValue.contains("  ")){
     
      textValue=textValue.replace("  ", " ");
     
    }
    return textValue;
  }
 
 
 
  /**return endpoints from last findtext*/
  public float[] getEndPoints() {
    return endPoints;
  }

  /**return text teasers from findtext if generateTeasers() called 
   * before find
   */
  public String[] getTeasers() {
   
    return teasers;
  }
 
  /**
   * tell find text to generate teasers as well
   */
  public void generateTeasers() {
   
    includeTease=true;
  }
}
TOP

Related Classes of org.jpedal.grouping.PdfGroupingAlgorithms

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.