Package org.apache.lucene.analysis.charfilter

Source Code of org.apache.lucene.analysis.charfilter.NormalizeCharMap$Builder

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.charfilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

// TODO: save/load?

/**
* Holds a map of String input to String output, to be used
* with {@link MappingCharFilter}.  Use the {@link Builder}
* to create this.
*/
public class NormalizeCharMap {

  final FST<CharsRef> map;
  final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<Character,FST.Arc<CharsRef>>();

  // Use the builder to create:
  private NormalizeCharMap(FST<CharsRef> map) {
    this.map = map;
    if (map != null) {
      try {
        // Pre-cache root arcs:
        final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
        final FST.BytesReader fstReader = map.getBytesReader();
        map.getFirstArc(scratchArc);
        if (FST.targetHasArcs(scratchArc)) {
          map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
          while(true) {
            assert scratchArc.label != FST.END_LABEL;
            cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
            if (scratchArc.isLast()) {
              break;
            }
            map.readNextRealArc(scratchArc, fstReader);
          }
        }
        //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
      } catch (IOException ioe) {
        // Bogus FST IOExceptions!!  (will never happen)
        throw new RuntimeException(ioe);
      }
    }
  }

  /**
   * Builds an NormalizeCharMap.
   * <p>
   * Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap
   * @lucene.experimental
   */
  public static class Builder {

    private final Map<String,String> pendingPairs = new TreeMap<String,String>();

    /** Records a replacement to be applied to the input
     *  stream.  Whenever <code>singleMatch</code> occurs in
     *  the input, it will be replaced with
     *  <code>replacement</code>.
     *
     * @param match input String to be replaced
     * @param replacement output String
     * @throws IllegalArgumentException if
     * <code>match</code> is the empty string, or was
     * already previously added
     */
    public void add(String match, String replacement) {
      if (match.length() == 0 ){
        throw new IllegalArgumentException("cannot match the empty string");
      }
      if (pendingPairs.containsKey(match)) {
        throw new IllegalArgumentException("match \"" + match + "\" was already added");
      }
      pendingPairs.put(match, replacement);
    }

    /** Builds the NormalizeCharMap; call this once you
     *  are done calling {@link #add}. */
    public NormalizeCharMap build() {

      final FST<CharsRef> map;
      try {
        final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
        final IntsRef scratch = new IntsRef();
        for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
          builder.add(Util.toUTF16(ent.getKey(), scratch),
                      new CharsRef(ent.getValue()));
        }
        map = builder.finish();
        pendingPairs.clear();
      } catch (IOException ioe) {
        // Bogus FST IOExceptions!!  (will never happen)
        throw new RuntimeException(ioe);
      }

      return new NormalizeCharMap(map);
    }
  }
}
TOP

Related Classes of org.apache.lucene.analysis.charfilter.NormalizeCharMap$Builder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.