Source Code of org.apache.lucene.collation.TestICUCollationKeyFilterFactory$StringMockResourceLoader

package org.apache.lucene.collation;


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;


import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;


import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;


@Deprecated
public class TestICUCollationKeyFilterFactory extends BaseTokenStreamTestCase {


  /*
   * Turkish has some funny casing.
   * This test shows how you can solve this kind of thing easily with collation.
   * Instead of using LowerCaseFilter, use a turkish collator with primary strength.
   * Then things will sort and match correctly.
   */
  public void testBasicUsage() throws Exception {
    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "tr",
        "strength", "primary");
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
  
  /*
   * Test usage of the decomposition option for unicode normalization.
   */
  public void testNormalization() throws Exception {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "tr",
        "strength", "primary",
        "decomposition", "canonical");
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
  
  /*
   * Test secondary strength, for english case is not significant.
   */
  public void testSecondaryStrength() throws Exception {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "strength", "secondary",
        "decomposition", "no");
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
  }
  
  /*
   * Setting alternate=shifted to shift whitespace, punctuation and symbols
   * to quaternary level 
   */
  public void testIgnorePunctuation() throws Exception {
    String withPunctuation = "foo-bar";
    String withoutPunctuation = "foo bar";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "strength", "primary",
        "alternate", "shifted");
    TokenStream tsPunctuation = factory.create(
        new KeywordTokenizer(new StringReader(withPunctuation)));
    TokenStream tsWithoutPunctuation = factory.create(
        new KeywordTokenizer(new StringReader(withoutPunctuation)));
    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
  }
  
  /*
   * Setting alternate=shifted and variableTop to shift whitespace, but not 
   * punctuation or symbols, to quaternary level 
   */
  public void testIgnoreWhitespace() throws Exception {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "strength", "primary",
        "alternate", "shifted",
        "variableTop", " ");
    TokenStream tsWithSpace = factory.create(
        new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
        new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
        new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
        new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
  }
  
  /*
   * Setting numeric to encode digits with numeric value, so that
   * foobar-9 sorts before foobar-10
   */
  public void testNumerics() throws Exception {
    String nine = "foobar-9";
    String ten = "foobar-10";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "numeric", "true");
    TokenStream tsNine = factory.create(
        new KeywordTokenizer(new StringReader(nine)));
    TokenStream tsTen = factory.create(
        new KeywordTokenizer(new StringReader(ten)));
    assertCollation(tsNine, tsTen, -1);
  }
  
  /*
   * Setting caseLevel=true to create an additional case level between
   * secondary and tertiary
   */
  public void testIgnoreAccentsButNotCase() throws Exception {
    String withAccents = "résumé";
    String withoutAccents = "resume";
    String withAccentsUpperCase = "Résumé";
    String withoutAccentsUpperCase = "Resume";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "strength", "primary",
        "caseLevel", "true");
    TokenStream tsWithAccents = factory.create(
        new KeywordTokenizer(new StringReader(withAccents)));
    TokenStream tsWithoutAccents = factory.create(
        new KeywordTokenizer(new StringReader(withoutAccents)));
    assertCollatesToSame(tsWithAccents, tsWithoutAccents);
    
    TokenStream tsWithAccentsUpperCase = factory.create(
        new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
    TokenStream tsWithoutAccentsUpperCase = factory.create(
        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
    assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
    
    // now assert that case still matters: resume < Resume
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(withoutAccents)));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
    assertCollation(tsLower, tsUpper, -1);
  }
  
  /*
   * Setting caseFirst=upper to cause uppercase strings to sort
   * before lowercase ones.
   */
  public void testUpperCaseFirst() throws Exception {
    String lower = "resume";
    String upper = "Resume";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
        "locale", "en",
        "strength", "tertiary",
        "caseFirst", "upper");
    TokenStream tsLower = factory.create(
        new KeywordTokenizer(new StringReader(lower)));
    TokenStream tsUpper = factory.create(
        new KeywordTokenizer(new StringReader(upper)));
    assertCollation(tsUpper, tsLower, -1);
  }


  /*
   * For german, you might want oe to sort and match with o umlaut.
   * This is not the default, but you can make a customized ruleset to do this.
   *
   * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
   *  http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
   */
  public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));


    String DIN5007_2_tailorings =
      "& ae , a\u0308 & AE , A\u0308"+
      "& oe , o\u0308 & OE , O\u0308"+
      "& ue , u\u0308 & UE , u\u0308";


    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();
    //
    // at this point, you would save these tailoredRules to a file, 
    // and use the custom parameter.
    //
    String germanUmlaut = "Töne";
    String germanOE = "Toene";
    Map<String,String> args = new HashMap<String,String>();
    args.put("custom", "rules.txt");
    args.put("strength", "primary");
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory(args);
    factory.inform(new StringMockResourceLoader(tailoredRules));
    TokenStream tsUmlaut = factory.create(
        new KeywordTokenizer(new StringReader(germanUmlaut)));
    TokenStream tsOE = factory.create(
        new KeywordTokenizer(new StringReader(germanOE)));


    assertCollatesToSame(tsUmlaut, tsOE);
  }
  
  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    assertCollation(stream1, stream2, 0);
  }
  
  private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1
        .addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2
        .addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
  }
  
  private class StringMockResourceLoader implements ResourceLoader {
    String text;


    StringMockResourceLoader(String text) {
      this.text = text;
    }


    public <T> T newInstance(String cname, Class<T> expectedType) {
      return null;
    }


    @Override
    public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
      return null;
    }


    public InputStream openResource(String resource) throws IOException {
      return new ByteArrayInputStream(text.getBytes("UTF-8"));
    }
  }
  
  private TokenFilterFactory tokenFilterFactory(String name, String... keysAndValues) throws Exception {
    Class<? extends TokenFilterFactory> clazz = TokenFilterFactory.lookupClass(name);
    if (keysAndValues.length % 2 == 1) {
      throw new IllegalArgumentException("invalid keysAndValues map");
    }
    Map<String,String> args = new HashMap<String,String>();
    for (int i = 0; i < keysAndValues.length; i += 2) {
      String previous = args.put(keysAndValues[i], keysAndValues[i+1]);
      assertNull("duplicate values for key: " + keysAndValues[i], previous);
    }
    String previous = args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
    assertNull("duplicate values for key: luceneMatchVersion", previous);
    TokenFilterFactory factory = null;
    try {
      factory = clazz.getConstructor(Map.class).newInstance(args);
    } catch (InvocationTargetException e) {
      // to simplify tests that check for illegal parameters
      if (e.getCause() instanceof IllegalArgumentException) {
        throw (IllegalArgumentException) e.getCause();
      } else {
        throw e;
      }
    }
    if (factory instanceof ResourceLoaderAware) {
      ((ResourceLoaderAware) factory).inform(new ClasspathResourceLoader(getClass()));
    }
    return factory;
  }
}
Source Code of org.apache.lucene.collation.TestICUCollationKeyFilterFactory$StringMockResourceLoader

Related Classes of org.apache.lucene.collation.TestICUCollationKeyFilterFactory$StringMockResourceLoader