// Source Code of org.apache.abdera.i18n.unicode.Normalizer

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.  For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.abdera.i18n.unicode;


import java.io.IOException;


import org.apache.abdera.i18n.io.CharUtils;
import org.apache.abdera.i18n.io.CodepointIterator;
import org.apache.abdera.i18n.unicode.UnicodeCharacterDatabase;




/**
 * Performs Unicode Normalization (Form D,C,KD and KC)
 */
public final class Normalizer {


  public enum Mask {
    NONE,
    COMPATIBILITY,
    COMPOSITION
  }
  
  public enum Form { 
    D, 
    C(Mask.COMPOSITION), 
    KD(Mask.COMPATIBILITY), 
    KC(Mask.COMPATIBILITY,Mask.COMPOSITION);
    
    private int mask = 0;


    Form(Mask... masks) {
      for (Mask mask : masks) {
        this.mask |= (mask.ordinal());
      }
    }
    
    public boolean isCompatibility() {
      return (mask & (Mask.COMPATIBILITY.ordinal())) != 0;
    }
    
    public boolean isCanonical() {
      return !isCompatibility();
    }
    
    public boolean isComposition() {
      return (mask & (Mask.COMPOSITION.ordinal())) != 0;
    }
  }
  
  private Normalizer() {}
  
  /**
   * Normalize the string using NFKC
   */
  public static StringBuffer normalize(String source) throws IOException {
    return normalize(source, Form.KC);
  }
  
  /**
   * Normalize the string using the specified Form
   */
  public static StringBuffer normalize(
    String source, 
    Form form) 
      throws IOException {
    return normalize(source, form, new StringBuffer());
  }
  
  /**
   * Normalize the string into the given StringBuffer using the given Form
   */
  public static StringBuffer normalize(
    String source, 
    Form form, 
    StringBuffer buf) 
      throws IOException {
      UnicodeCharacterDatabase ucd = UnicodeCharacterDatabase.getInstance();
      if (source.length() != 0 && ucd != null) {
        decompose(ucd, source, form, buf);
        compose(ucd, form, buf);
      }
      return buf;
  }
  
  private static void decompose(
    UnicodeCharacterDatabase ucd,
    String source, 
    Form form, 
    StringBuffer buf) 
      throws IOException {
      StringBuffer internal = new StringBuffer();
      CodepointIterator ci = CodepointIterator.forCharSequence(source);
      boolean canonical = form.isCanonical();
      while (ci.hasNext()) {
        int c = ci.next();
        internal.setLength(0);
        ucd.decompose(c, canonical, internal);
        CodepointIterator ii = CodepointIterator.forCharSequence(internal);
        while(ii.hasNext()) {
          int ch = ii.next();
          int i = findInsertionPoint(ucd, buf, ch);
          buf.insert(i,CharUtils.toString(ch));
        }
      }
    
  }
  
  private static int findInsertionPoint(
    UnicodeCharacterDatabase ucd, 
    StringBuffer buf, int c) {
    int cc = ucd.getCanonicalClass(c);
    int i = buf.length();
    if (cc != 0) {
      int ch;
      for (; i > 0; i -= CharUtils.size(c)) {
        ch = CharUtils.charAt(buf, i-1);
        if (ucd.getCanonicalClass(ch) <= cc) break;
      }
    }
    return i;
  }
  
  private static void compose(
    UnicodeCharacterDatabase ucd,
    Form form, 
    StringBuffer buf) 
      throws IOException {
    if (!form.isComposition()) return;
    int pos = 0;
    int lc = CharUtils.charAt(buf, pos);
    int cpos = CharUtils.size(lc);    
    int lcc = ucd.getCanonicalClass(lc);
    if (lcc != 0) lcc = 256;
    int len = buf.length();
    int c;
    for (int dpos = cpos; dpos < buf.length(); dpos += CharUtils.size(c)) {
      c = CharUtils.charAt(buf,dpos);
      int cc = ucd.getCanonicalClass(c);
      int composite = ucd.getPairComposition(lc, c);
      if (composite != '\uFFFF' && (lcc < cc || lcc == 0)) {
        CharUtils.setChar(buf, pos, composite);
        lc = composite;
      } else {
        if (cc == 0) {
          pos = cpos;
          lc = c;
        }
        lcc = cc;
        CharUtils.setChar(buf,cpos,c);
        if (buf.length() != len) {
          dpos += buf.length() - len;
          len = buf.length();
        }
        cpos += CharUtils.size(c);
      }
    }
    buf.setLength(cpos);
  }
  
  public static void main(String... args) throws Exception {
    
    UnicodeCharacterDatabase.main("i18n/src/main/resources/org/apache/abdera/i18n/unicode/data/ucd.res");
    
  }
}
// Source Code of org.apache.abdera.i18n.unicode.Normalizer

// Related Classes of org.apache.abdera.i18n.unicode.Normalizer