Package com.jpetrak.gate.stringannotation.extendedgazetteer2.trie3

Source Code of com.jpetrak.gate.stringannotation.extendedgazetteer2.trie3.StoreArrayOfCharArrays

package com.jpetrak.gate.stringannotation.extendedgazetteer2.trie3;

import java.io.Serializable;

import gate.util.GateRuntimeException;
import it.unimi.dsi.fastutil.chars.CharBigArrayBigList;

/**
* This represents a store that can add and retrieve char[] elements. Each char[] chunk
* is identified by an int index which is returned when adding the chunk and which
* can be used to get it back. The number of chunks in a store is limited to 2^31, and
* the length of each chunk is also limited to 2^31, but the total amount of chars in
* the store can exceed 2^32 (unsigned 32 bit).
* <p>
*  The store supports storing the following:
<ul>
<li>Varying length chunks. These consist of two chars that encode the length of the data
*  (not including the length field) followed  by the data
<li>Fixed length chunks: just data, the client must know the length and supply it both when
*  adding and retrieving data
<li>Lists: each list consists of one or more elements of varying length chunks. The first
*  of a list consists of the the chunk length, the list size, the index of the next element chunk
*   and the actual chunk data for the first element. Elements 2-N consist of the chunk length,
*   the index of the next element chunk and the actual chunk data. Next chunk index and in the
*   first element, the list size are included in chunk length, the chunk length field itself
*   is not.
</ul>
*
* @author Johann Petrak
*
*/
public class StoreArrayOfCharArrays implements Serializable {
  
  /**
   *
   */
  private static final long serialVersionUID = 1238979454339893943L;
  CharBigArrayBigList theList = new CharBigArrayBigList();
  int curIndex = 0;
 
  private char[] zeroChars = Utils.int2TwoChars(0);
  private char[] oneChars = Utils.int2TwoChars(1);
 
  /**
   * Add some data and get back the index under which we can get it back
   * @param data
   * @return
   */
  public int addData(char[] data) {
    // remember where we store the data
    int oldIndex = curIndex;
    // first store the length of the data
    // we split the int that represents the length into to chars
    int l = data.length;
    char[] lAsChars = Utils.int2TwoChars(l);
    addChars(lAsChars);
    addChars(data);
    // after storing, the new index is now moved by the length of the data
    // plus the two chars where we store the length
    curIndex += data.length+2;
    return oldIndex;
  }

  /**
   * Add some data and get back the index under which we can get it back. This will
   * add a chunk of data of known length to the store: no length is stored in the
   * store for this chunk. This chunk can only be retrieved with the getFixedLengthData
   * method.
   *
   * @param data
   * @return
   */
  public int addFixedLengthData(char[] data) {
    // remember where we store the data
    int oldIndex = curIndex;
    addChars(data);
    // after storing, the new index is now moved by the length of the data
    curIndex += data.length;
    return oldIndex;
  }
 
  public int replaceFixedLengthData(int index, char[] data) {
    for(int i = 0; i<data.length; i++) {
      theList.set(index+i,data[i]);
    }
    return index;
  }
 
  /**
   * Get the data from at the given index.
   * @param index
   * @return
   */
  public char[] getData(int index) {
    // retrieve the length
    int l = Utils.twoChars2Int(theList.get(index), theList.get(index+1));
    // now retrieve the characters for this data block
    char data[] = new char[l];
    for(int i=0; i<l; i++) {
      data[i] = theList.get(index+2+i);
    }
    return data;
  }

  /**
   * Get data of known length from at the given index.
   * @param index
   * @return
   */
  public char[] getFixedLengthData(int index, int length) {
    char data[] = new char[length];
    for(int i=0; i<length; i++) {
      data[i] = theList.get(index+i);
    }
    return data;
  }
 
  /**
   * Add a new list to the store and return its index. After this, a list with
   * length of 1 is stored.
   *
   * @param data
   * @return
   */
  public int addListData(char[] data) {
    // create the special first list entry:
    // = length of list (int=2 chars), set to 1
    // = index of next list entry (int=2 chars), set to 0
    // = actual data
    // remember where we store the data
    int oldIndex = curIndex;
    // first store the length of the data: for the first list entry
    // this also includes size and next element index, so add 4
    // we split the int that represents the length into to chars
    int l = data.length+4;
    char[] lAsChars = Utils.int2TwoChars(l);
    addChars(lAsChars);
    addChars(oneChars);
    addChars(zeroChars);
    addChars(data);
    // after storing, the new index is now moved by the length of the data
    // plus the two chars where we store the length
    curIndex += data.length+6;
    return oldIndex;   
  }
 
  /**
   * Append additional data blocks to a list that already exists in the store at
   * the given index. if index is <=0, add a new list.
   *
   * @param index
   * @param data
   * @return
   */
  // TODO: instead of doing getData to get the element indices, just directly access
  // the characters
  public int addListData(int index, char[] data) {
    if(index <= 0) {
      return addListData(data);
    }

    int size = getListSize(index);
    if(size < 1) {
      throw new GateRuntimeException("Adding to a list, but size is <1: "+size);
    }

   
    // update the size
    char sz[] = Utils.int2TwoChars(size+1);
    theList.set(index+2,sz[0]); // just skip the data length characters: 2 characters
    theList.set(index+3,sz[1]);
   
   
    // store the new data
    int newBlockIndex = addNewListBlock(data);
   
    // now add the index of that block to either the first block or
    // dereference until we are the correct block
   
    int curBlockIndex = index;
    // if we need to append not at the first block (size!=1, get the next block which
    // corresponds to size=2   
    // then if size > 2, iterate as often as still needed
    if(size > 1) {
      curBlockIndex = getNextElementIndex4First(index);
      for(int i=2; i<size; i++) {
        curBlockIndex = getNextElementIndex4Other(curBlockIndex);
      }
    }
   
    // encode the new block index
    char idx[] = Utils.int2TwoChars(newBlockIndex);
   
   
    if(size == 1) {
      theList.set(index+4,idx[0]);
      theList.set(index+5,idx[1]);           
    } else {
      theList.set(curBlockIndex+2,idx[0]);
      theList.set(curBlockIndex+3,idx[1]);                 
    }
   
   
    return index;
  }
 
  /**
   * Return the list element at the given index.
   * @param index
   * @param element
   * @return
   */
  public char[] getListData(int index, int element) {
    // get the first block which must exist
    int size = getListSize(index);
    if(size <= 0) {
      throw new GateRuntimeException("getting list data but size is <=0: "+size);
    }
    assert(element<size);
    if(element >= size) {
      throw new GateRuntimeException("getting list data but element>=size: "+element+"/"+size);
    }
   
    if(size == 1 && element != 0) {
      throw new GateRuntimeException("getting list data but size=1 and element!=0: "+element);
    }
    if(element == 0) {
      return getDataWithout(index, 4);
    }
    // if we need an element >0,
    // de-reference the current block "element" times and get the data from there
    int nextBlockIndex = getNextElementIndex4First(index);
    // we did already the first dereferencing from the first block, so if necessary
    // dereference element-1 more times
    for(int i=0;i<(element-1);i++) {
      nextBlockIndex = getNextElementIndex4Other(nextBlockIndex);
    }
    // we have the index of the block we want, return it
    return getDataWithout(nextBlockIndex,2);
  }
 
  /**
   * Find the chunk among all the list elements stored at index and
   * return the index of the element (>= 0) if found or -1 if not found.
   *
   * @param index index of the list in the store
   * @param chunk the chunk to find
   * @return the index of the chunk in the list or -1 if not found
   */
  public int findListData(int index, char[] chunk) {
    int elementIndex = 0;
   
    // if the list exists at all, there always must be at least one element, so
    // always check the first element.
    // Find the start and the length of the first element and compare
    int length = Utils.twoChars2Int(theList.get(index), theList.get(index+1));
    int chunkIndex = index+6// 2 for the chunk length, 2 for list size,, 2 for next element index
    if(isChunkEqual(chunkIndex,length-4,chunk)) {
      return elementIndex;
    }
    // get the next chunk pointer
    int nextBlockIndex = getNextElementIndex4First(index);
    while(nextBlockIndex != 0) {
      elementIndex++;
      // now check the block at this index!
      length = Utils.twoChars2Int(theList.get(nextBlockIndex), theList.get(nextBlockIndex+1));
      chunkIndex = nextBlockIndex+4; // 2 for chunk length, 2 for next element index
      if(isChunkEqual(chunkIndex,length-2,chunk)) {
        return elementIndex;
      }
      nextBlockIndex = getNextElementIndex4Other(nextBlockIndex);
    }
    return -1;
  }

 
  /**
   * Check if the characters in the store, starting at index and having length length,
   * are identical to the characters of the chunk.
   * @param index
   * @param length
   * @param chunk
   * @return
   */
  protected boolean isChunkEqual(int index, int length, char[] chunk) {
    if(chunk.length != length) {
      return false;
    }
    for(int i = 0; i<length; i++) {
      if(theList.get(index+i) != chunk[i]) {
        return false;
      }
    }
    return true;
  }
 
  /**
   * Return the size of the list at the given index
   * @param index
   * @return
   */
  public int getListSize(int index) {
   return Utils.twoChars2Int(theList.get(index+2), theList.get(index+3));
  }
 
  //*******************************************************************
 
 
  private int getNextElementIndex4First(int index) {
    return Utils.twoChars2Int(theList.get(index+4), theList.get(index+5));
  }
  private int getNextElementIndex4Other(int index) {
    return Utils.twoChars2Int(theList.get(index+2), theList.get(index+3));
  }
 
  // special method to retrieve just the data from a list element: same as
  // the ordinary get data, except the we have to skip to chars at the
  // beginning which is the next element pointer, or the size and next element pointer
  // without is 2 or 4 for these.
  private char[] getDataWithout(int index, int without) {
    // retrieve the length
    int l = Utils.twoChars2Int(theList.get(index), theList.get(index+1));
    // now retrieve the characters for this data block
    char data[] = new char[l-without];
    for(int i=0; i<(l-without); i++) {
      data[i] = theList.get(index+2+without+i);
    }
    return data;
  }
 
  // similar to addData but also adds the empty next block entry at the beginning
  private int addNewListBlock(char[] data) {
    // remember where we store the data
    int oldIndex = curIndex;
    // first store the length of the data
    // we split the int that represents the length into to chars
    int l = data.length+2;
    char[] lAsChars = Utils.int2TwoChars(l);
    addChars(lAsChars);
    addChars(zeroChars);
    addChars(data);
    // after storing, the new index is now moved by the length of the data
    // plus the two chars where we store the length
    curIndex += data.length+4;
    return oldIndex;   
  }
 
 
  private void addChars(char[] cs) {
    for(char c : cs) {
      theList.add(c);
    }
  }
 
 
}
TOP

Related Classes of com.jpetrak.gate.stringannotation.extendedgazetteer2.trie3.StoreArrayOfCharArrays

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.