Package com.orientechnologies.orient.core.index

Source Code of com.orientechnologies.orient.core.index.OIndexFullText

/*
  *
  *  *  Copyright 2014 Orient Technologies LTD (info(at)orientechnologies.com)
  *  *
  *  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  *  you may not use this file except in compliance with the License.
  *  *  You may obtain a copy of the License at
  *  *
  *  *       http://www.apache.org/licenses/LICENSE-2.0
  *  *
  *  *  Unless required by applicable law or agreed to in writing, software
  *  *  distributed under the License is distributed on an "AS IS" BASIS,
  *  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *  *  See the License for the specific language governing permissions and
  *  *  limitations under the License.
  *  *
  *  * For more information: http://www.orientechnologies.com
  *
  */
package com.orientechnologies.orient.core.index;

import com.orientechnologies.common.listener.OProgressListener;
import com.orientechnologies.orient.core.db.record.OIdentifiable;
import com.orientechnologies.orient.core.db.record.ORecordElement;
import com.orientechnologies.orient.core.db.record.ridbag.sbtree.OIndexRIDContainer;
import com.orientechnologies.orient.core.record.impl.ODocument;
import com.orientechnologies.orient.core.serialization.serializer.OStringSerializerHelper;
import com.orientechnologies.orient.core.serialization.serializer.stream.OStreamSerializer;
import com.orientechnologies.orient.core.type.tree.OMVRBTreeRIDSet;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* Fast index for full-text searches.
*
* @author Luca Garulli
*
*/
public class OIndexFullText extends OIndexMultiValues {

  private static final String  CONFIG_STOP_WORDS      = "stopWords";
  private static final String  CONFIG_SEPARATOR_CHARS = "separatorChars";
  private static final String  CONFIG_IGNORE_CHARS    = "ignoreChars";
  private static final String  CONFIG_INDEX_RADIX     = "indexRadix";
  private static final String  CONFIG_MIN_WORD_LEN    = "minWordLength";
  private static final boolean DEF_INDEX_RADIX        = true;
  private static final String  DEF_SEPARATOR_CHARS    = " \r\n\t:;,.|+*/\\=!?[]()";
  private static final String  DEF_IGNORE_CHARS       = "'\"";
  private static final String  DEF_STOP_WORDS         = "the in a at as and or for his her " + "him this that what which while "
                                                          + "up with be was were is";
  private static int           DEF_MIN_WORD_LENGTH    = 3;
  private boolean              indexRadix;
  private String               separatorChars;
  private String               ignoreChars;
  private int                  minWordLength;

  private Set<String>          stopWords;

  public OIndexFullText(String typeId, String algorithm, OIndexEngine<Set<OIdentifiable>> indexEngine,
      String valueContainerAlgorithm, ODocument metadata) {
    super(typeId, algorithm, indexEngine, valueContainerAlgorithm, metadata);
    config();
    configWithMetadata(metadata);

  }

  /**
   * Indexes a value and save the index. Splits the value in single words and index each one. Save of the index is responsibility of
   * the caller.
   */
  @Override
  public OIndexFullText put(Object key, final OIdentifiable iSingleValue) {
    checkForRebuild();

    if (key == null)
      return this;

    key = getCollatingValue(key);

    modificationLock.requestModificationLock();

    try {
      final Set<String> words = splitIntoWords(key.toString());

      // FOREACH WORD CREATE THE LINK TO THE CURRENT DOCUMENT
      for (final String word : words) {
        acquireExclusiveLock();
        startStorageAtomicOperation();
        try {
          Set<OIdentifiable> refs;

          // SEARCH FOR THE WORD
          refs = indexEngine.get(word);

          if (refs == null) {
            // WORD NOT EXISTS: CREATE THE KEYWORD CONTAINER THE FIRST TIME THE WORD IS FOUND
            if (ODefaultIndexFactory.SBTREEBONSAI_VALUE_CONTAINER.equals(valueContainerAlgorithm)) {
              boolean durable = false;
              if (metadata != null && Boolean.TRUE.equals(metadata.field("durableInNonTxMode")))
                durable = true;

              refs = new OIndexRIDContainer(getName(), durable);
            } else {
              refs = new OMVRBTreeRIDSet();
              ((OMVRBTreeRIDSet) refs).setAutoConvertToRecord(false);
            }
          }

          // ADD THE CURRENT DOCUMENT AS REF FOR THAT WORD
          refs.add(iSingleValue);

          // SAVE THE INDEX ENTRY
          indexEngine.put(word, refs);

          commitStorageAtomicOperation();
        } catch (RuntimeException e) {
          rollbackStorageAtomicOperation();
          throw new OIndexException("Error during put of key - value entry", e);
        } finally {
          releaseExclusiveLock();
        }
      }
      return this;
    } finally {
      modificationLock.releaseModificationLock();
    }
  }

  /**
   * Splits passed in key on several words and remove records with keys equals to any item of split result and values equals to
   * passed in value.
   *
   * @param key
   *          Key to remove.
   * @param value
   *          Value to remove.
   * @return <code>true</code> if at least one record is removed.
   */
  @Override
  public boolean remove(Object key, final OIdentifiable value) {
    checkForRebuild();

    key = getCollatingValue(key);

    modificationLock.requestModificationLock();

    try {
      final Set<String> words = splitIntoWords(key.toString());
      boolean removed = false;

      for (final String word : words) {
        acquireExclusiveLock();
        startStorageAtomicOperation();
        try {

          final Set<OIdentifiable> recs = indexEngine.get(word);
          if (recs != null && !recs.isEmpty()) {
            if (recs.remove(value)) {
              if (recs.isEmpty())
                indexEngine.remove(word);
              else
                indexEngine.put(word, recs);
              removed = true;
            }
          }
          commitStorageAtomicOperation();
        } catch (RuntimeException e) {
          rollbackStorageAtomicOperation();
          throw new OIndexException("Error during removal of entry by key and value", e);
        } finally {
          releaseExclusiveLock();
        }
      }

      return removed;
    } finally {
      modificationLock.releaseModificationLock();
    }
  }

  @Override
  public OIndexInternal<?> create(String name, OIndexDefinition indexDefinition, String clusterIndexName,
      Set<String> clustersToIndex, boolean rebuild, OProgressListener progressListener, OStreamSerializer valueSerializer) {

    if (indexDefinition.getFields().size() > 1) {
      throw new OIndexException(type + " indexes cannot be used as composite ones.");
    }

    return super.create(name, indexDefinition, clusterIndexName, clustersToIndex, rebuild, progressListener, valueSerializer);
  }

  @Override
  public OIndexMultiValues create(String name, OIndexDefinition indexDefinition, String clusterIndexName,
      Set<String> clustersToIndex, boolean rebuild, OProgressListener progressListener) {
    if (indexDefinition.getFields().size() > 1) {
      throw new OIndexException(type + " indexes cannot be used as composite ones.");
    }
    return super.create(name, indexDefinition, clusterIndexName, clustersToIndex, rebuild, progressListener);
  }

  @Override
  public ODocument updateConfiguration() {
    super.updateConfiguration();
    configuration.setInternalStatus(ORecordElement.STATUS.UNMARSHALLING);

    try {
      configuration.field(CONFIG_SEPARATOR_CHARS, separatorChars);
      configuration.field(CONFIG_IGNORE_CHARS, ignoreChars);
      configuration.field(CONFIG_STOP_WORDS, stopWords);
      configuration.field(CONFIG_MIN_WORD_LEN, minWordLength);
      configuration.field(CONFIG_INDEX_RADIX, indexRadix);

    } finally {
      configuration.setInternalStatus(ORecordElement.STATUS.LOADED);
    }
    return configuration;
  }

  public boolean canBeUsedInEqualityOperators() {
    return false;
  }

  public boolean supportsOrderedIterations() {
    return false;
  }

  protected void configWithMetadata(ODocument metadata) {
    if (metadata != null) {
      if (metadata.containsField(CONFIG_IGNORE_CHARS))
        ignoreChars = (String) metadata.field(CONFIG_IGNORE_CHARS);

      if (metadata.containsField(CONFIG_INDEX_RADIX))
        indexRadix = (Boolean) metadata.field(CONFIG_INDEX_RADIX);

      if (metadata.containsField(CONFIG_SEPARATOR_CHARS))
        separatorChars = (String) metadata.field(CONFIG_SEPARATOR_CHARS);

      if (metadata.containsField(CONFIG_MIN_WORD_LEN))
        minWordLength = (Integer) metadata.field(CONFIG_MIN_WORD_LEN);

      if (metadata.containsField(CONFIG_STOP_WORDS))
        stopWords = new HashSet<String>((Collection<? extends String>) metadata.field(CONFIG_STOP_WORDS));
    }

  }

  protected void config() {
    ignoreChars = DEF_IGNORE_CHARS;
    indexRadix = DEF_INDEX_RADIX;
    separatorChars = DEF_SEPARATOR_CHARS;
    minWordLength = DEF_MIN_WORD_LENGTH;
    stopWords = new HashSet<String>(OStringSerializerHelper.split(DEF_STOP_WORDS, ' '));
  }

  @Override
  protected void putInSnapshot(Object key, OIdentifiable value, Map<Object, Object> snapshot) {
    if (key == null)
      return;

    key = getCollatingValue(key);

    final Set<String> words = splitIntoWords(key.toString());

    // FOREACH WORD CREATE THE LINK TO THE CURRENT DOCUMENT
    for (final String word : words) {
      Set<OIdentifiable> refs;

      final Object snapshotValue = snapshot.get(word);
      if (snapshotValue == null)
        refs = indexEngine.get(word);
      else if (snapshotValue.equals(RemovedValue.INSTANCE))
        refs = null;
      else
        refs = (Set<OIdentifiable>) snapshotValue;

      if (refs == null) {
        // WORD NOT EXISTS: CREATE THE KEYWORD CONTAINER THE FIRST TIME THE WORD IS FOUND
        if (ODefaultIndexFactory.SBTREEBONSAI_VALUE_CONTAINER.equals(valueContainerAlgorithm)) {
          boolean durable = false;
          if (metadata != null && Boolean.TRUE.equals(metadata.field("durableInNonTxMode")))
            durable = true;

          refs = new OIndexRIDContainer(getName(), durable);
        } else {
          refs = new OMVRBTreeRIDSet();
          ((OMVRBTreeRIDSet) refs).setAutoConvertToRecord(false);
        }

        snapshot.put(word, refs);
      }
      // ADD THE CURRENT DOCUMENT AS REF FOR THAT WORD
      refs.add(value.getIdentity());
    }
  }

  @Override
  protected void removeFromSnapshot(Object key, OIdentifiable value, Map<Object, Object> snapshot) {
    key = getCollatingValue(key);

    final Set<String> words = splitIntoWords(key.toString());
    for (final String word : words) {
      final Set<OIdentifiable> recs;
      final Object snapshotValue = snapshot.get(word);
      if (snapshotValue == null)
        recs = indexEngine.get(word);
      else if (snapshotValue.equals(RemovedValue.INSTANCE))
        recs = null;
      else
        recs = (Set<OIdentifiable>) snapshotValue;

      if (recs != null && !recs.isEmpty()) {
        if (recs.remove(value)) {
          if (recs.isEmpty())
            snapshot.put(word, RemovedValue.INSTANCE);
          else
            snapshot.put(word, recs);
        }
      }
    }
  }

  private Set<String> splitIntoWords(final String iKey) {
    final Set<String> result = new HashSet<String>();

    final List<String> words = (List<String>) OStringSerializerHelper.split(new ArrayList<String>(), iKey, 0, -1, separatorChars);

    final StringBuilder buffer = new StringBuilder(64);
    // FOREACH WORD CREATE THE LINK TO THE CURRENT DOCUMENT

    char c;
    boolean ignore;
    for (String word : words) {
      buffer.setLength(0);

      for (int i = 0; i < word.length(); ++i) {
        c = word.charAt(i);
        ignore = false;
        for (int k = 0; k < ignoreChars.length(); ++k)
          if (c == ignoreChars.charAt(k)) {
            ignore = true;
            break;
          }

        if (!ignore)
          buffer.append(c);
      }

      int length = buffer.length();

      while (length >= minWordLength) {
        buffer.setLength(length);
        word = buffer.toString();

        // CHECK IF IT'S A STOP WORD
        if (!stopWords.contains(word))
          // ADD THE WORD TO THE RESULT SET
          result.add(word);

        if (indexRadix)
          length--;
        else
          break;
      }
    }

    return result;
  }
}
TOP

Related Classes of com.orientechnologies.orient.core.index.OIndexFullText

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.