/*
*******************************************************************************
* Copyright (C) 2009-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.VersionInfo;
public final class Normalizer2Impl {
public static final class Hangul {
    /* Korean Hangul and Jamo constants */
    public static final int JAMO_L_BASE = 0x1100; /* "lead" jamo */
    public static final int JAMO_V_BASE = 0x1161; /* "vowel" jamo */
    public static final int JAMO_T_BASE = 0x11a7; /* "trail" jamo */
    public static final int HANGUL_BASE = 0xac00;
    public static final int JAMO_L_COUNT = 19;
    public static final int JAMO_V_COUNT = 21;
    public static final int JAMO_T_COUNT = 28;
    public static final int JAMO_L_LIMIT = JAMO_L_BASE + JAMO_L_COUNT;
    public static final int JAMO_V_LIMIT = JAMO_V_BASE + JAMO_V_COUNT;
    public static final int JAMO_VT_COUNT = JAMO_V_COUNT * JAMO_T_COUNT;
    public static final int HANGUL_COUNT = JAMO_L_COUNT * JAMO_V_COUNT * JAMO_T_COUNT;
    public static final int HANGUL_LIMIT = HANGUL_BASE + HANGUL_COUNT;

    /** Is c a precomposed Hangul syllable (U+AC00..U+D7A3)? */
    public static boolean isHangul(final int c) {
        return c >= HANGUL_BASE && c < HANGUL_LIMIT;
    }

    /** Is c an LV syllable, i.e. a Hangul syllable without a trailing consonant? */
    public static boolean isHangulWithoutJamoT(char c) {
        // char arithmetic wraps modulo 0x10000, so code units below HANGUL_BASE
        // land outside [0, HANGUL_COUNT) and fail the range test.
        final char offset = (char) (c - HANGUL_BASE);
        return offset < HANGUL_COUNT && offset % JAMO_T_COUNT == 0;
    }

    /** Is c a leading-consonant Jamo (U+1100..U+1112)? */
    public static boolean isJamoL(final int c) {
        return c >= JAMO_L_BASE && c < JAMO_L_LIMIT;
    }

    /** Is c a vowel Jamo (U+1161..U+1175)? */
    public static boolean isJamoV(final int c) {
        return c >= JAMO_V_BASE && c < JAMO_V_LIMIT;
    }

    /**
     * Decomposes c, which must be a Hangul syllable, into buffer and returns the length of the decomposition (2 or 3).
     */
    public static int decompose(int c, final Appendable buffer) {
        try {
            final int sIndex = c - HANGUL_BASE;
            final int tIndex = sIndex % JAMO_T_COUNT; // trailing-consonant index, 0 if none
            final int lvIndex = sIndex / JAMO_T_COUNT;
            buffer.append((char) (JAMO_L_BASE + lvIndex / JAMO_V_COUNT));
            buffer.append((char) (JAMO_V_BASE + lvIndex % JAMO_V_COUNT));
            if (tIndex == 0) {
                return 2;
            }
            buffer.append((char) (JAMO_T_BASE + tIndex));
            return 3;
        } catch (IOException e) {
            // Appendable.append() declares IOException, but we never append to I/O here.
            throw new RuntimeException(e);
        }
    }

    /**
     * Decomposes c, which must be a Hangul syllable, into buffer. This is the raw, not recursive, decomposition. Its length is always
     * 2.
     */
    public static void getRawDecomposition(int c, final Appendable buffer) {
        try {
            final int tIndex = (c - HANGUL_BASE) % JAMO_T_COUNT;
            if (tIndex == 0) {
                // LV syllable: raw decomposition is L + V.
                final int lvIndex = (c - HANGUL_BASE) / JAMO_T_COUNT;
                buffer.append((char) (JAMO_L_BASE + lvIndex / JAMO_V_COUNT));
                buffer.append((char) (JAMO_V_BASE + lvIndex % JAMO_V_COUNT));
            } else {
                // LVT syllable: raw decomposition is the LV syllable + T.
                buffer.append((char) (c - tIndex));
                buffer.append((char) (JAMO_T_BASE + tIndex));
            }
        } catch (IOException e) {
            // Appendable.append() declares IOException, but we never append to I/O here.
            throw new RuntimeException(e);
        }
    }
}
/**
* Writable buffer that takes care of canonical ordering. Its Appendable methods behave like the C++ implementation's appendZeroCC()
* methods.
* <p>
* If dest is a StringBuilder, then the buffer writes directly to it. Otherwise, the buffer maintains a StringBuilder for intermediate
* text segments until no further changes are necessary and whole segments are appended. append() methods that take combining-class
* values always write to the StringBuilder. Other append() methods flush and append to the Appendable.
*/
public static final class ReorderingBuffer implements Appendable {
/**
 * @param ni normalization data provider, used for combining-class lookups
 * @param dest final destination; written directly when it is a StringBuilder
 * @param destCapacity initial capacity hint for the working buffer
 */
public ReorderingBuffer(final Normalizer2Impl ni, final Appendable dest, final int destCapacity) {
impl = ni;
app = dest;
if (app instanceof StringBuilder) {
appIsStringBuilder = true;
str = (StringBuilder) dest;
// In Java, the constructor subsumes public void init(int destCapacity) {
str.ensureCapacity(destCapacity);
reorderStart = 0;
if (str.length() == 0) {
lastCC = 0;
} else {
// dest already contains text: recover lastCC and reorderStart from its
// trailing characters so that later insertions reorder correctly.
setIterator();
lastCC = previousCC();
// Set reorderStart after the last code point with cc<=1 if there is one.
if (lastCC > 1) {
while (previousCC() > 1) {
}
}
reorderStart = codePointLimit;
}
} else {
appIsStringBuilder = false;
str = new StringBuilder();
reorderStart = 0;
lastCC = 0;
}
}
/** Returns true if no text has been accumulated in the working buffer. */
public boolean isEmpty() {
return str.length() == 0;
}
/** Returns the number of UTF-16 code units currently in the working buffer. */
public int length() {
return str.length();
}
/** Returns the tracked combining class (ccc) of the end of the buffer. */
public int getLastCC() {
return lastCC;
}
/** Returns the working StringBuilder (identical to dest when dest was a StringBuilder). */
public StringBuilder getStringBuilder() {
return str;
}
/** Compares the buffer contents with s[start..limit[ for binary equality. */
public boolean equals(final CharSequence s, final int start, final int limit) {
return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
}
// For Hangul composition, replacing the Leading consonant Jamo with the syllable.
public void setLastChar(final char c) {
str.setCharAt(str.length() - 1, c);
}
/**
 * Appends code point c with combining class cc, inserting it earlier in the
 * buffer if needed to keep the trailing marks in canonical order.
 */
public void append(final int c, final int cc) {
if (lastCC <= cc || cc == 0) {
str.appendCodePoint(c);
lastCC = cc;
if (cc <= 1) {
reorderStart = str.length();
}
} else {
// Out of canonical order: find the insertion point among the trailing marks.
insert(c, cc);
}
}
// s must be in NFD, otherwise change the implementation.
/**
 * Appends s[start..limit[ whose first code point has combining class leadCC and
 * whose last has trailCC; reorders at the boundary when leadCC is out of order.
 */
public void append(final CharSequence s, int start, final int limit, int leadCC, final int trailCC) {
if (start == limit) {
return;
}
if (lastCC <= leadCC || leadCC == 0) {
if (trailCC <= 1) {
reorderStart = str.length() + (limit - start);
} else if (leadCC <= 1) {
reorderStart = str.length() + 1; // Ok if not a code point boundary.
}
str.append(s, start, limit);
lastCC = trailCC;
} else {
// The first code point needs reordering; append the rest one by one.
int c = Character.codePointAt(s, start);
start += Character.charCount(c);
insert(c, leadCC); // insert first code point
while (start < limit) {
c = Character.codePointAt(s, start);
start += Character.charCount(c);
if (start < limit) {
// s must be in NFD, otherwise we need to use getCC().
leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
} else {
leadCC = trailCC;
}
append(c, leadCC);
}
}
}
// The following append() methods work like C++ appendZeroCC().
// They assume that the cc or trailCC of their input is 0.
// Most of them implement Appendable interface methods.
// @Override when we switch to Java 6
public ReorderingBuffer append(final char c) {
str.append(c);
lastCC = 0;
reorderStart = str.length();
return this;
}
/** Appends code point c, assuming its combining class is 0. */
public void appendZeroCC(final int c) {
str.appendCodePoint(c);
lastCC = 0;
reorderStart = str.length();
}
// @Override when we switch to Java 6
public ReorderingBuffer append(final CharSequence s) {
if (s.length() != 0) {
str.append(s);
lastCC = 0;
reorderStart = str.length();
}
return this;
}
// @Override when we switch to Java 6
public ReorderingBuffer append(final CharSequence s, final int start, final int limit) {
if (start != limit) {
str.append(s, start, limit);
lastCC = 0;
reorderStart = str.length();
}
return this;
}
/**
 * Flushes from the intermediate StringBuilder to the Appendable, if they are different objects. Used after recomposition. Must be
 * called at the end when writing to a non-StringBuilder Appendable.
 */
public void flush() {
if (appIsStringBuilder) {
reorderStart = str.length();
} else {
try {
app.append(str);
str.setLength(0);
reorderStart = 0;
} catch (IOException e) {
throw new RuntimeException(e); // Avoid declaring "throws IOException".
}
}
lastCC = 0;
}
/**
 * Flushes from the intermediate StringBuilder to the Appendable, if they are different objects. Then appends the new text to the
 * Appendable or StringBuilder. Normally used after quick check loops find a non-empty sequence.
 */
public ReorderingBuffer flushAndAppendZeroCC(final CharSequence s, final int start, final int limit) {
if (appIsStringBuilder) {
str.append(s, start, limit);
reorderStart = str.length();
} else {
try {
app.append(str).append(s, start, limit);
str.setLength(0);
reorderStart = 0;
} catch (IOException e) {
throw new RuntimeException(e); // Avoid declaring "throws IOException".
}
}
lastCC = 0;
return this;
}
/** Empties the working buffer and resets the ordering state. */
public void remove() {
str.setLength(0);
lastCC = 0;
reorderStart = 0;
}
/** Removes the last suffixLength code units and resets the tracked ccc to 0. */
public void removeSuffix(final int suffixLength) {
int oldLength = str.length();
str.delete(oldLength - suffixLength, oldLength);
lastCC = 0;
reorderStart = str.length();
}
/*
 * TODO: Revisit whether it makes sense to track reorderStart.
 * It is set to after the last known character with cc<=1,
 * which stops previousCC() before it reads that character and looks up its cc.
 * previousCC() is normally only called from insert().
 * In other words, reorderStart speeds up the insertion of a combining mark
 * into a multi-combining mark sequence where it does not belong at the end.
 * This might not be worth the trouble.
 * On the other hand, it's not a huge amount of trouble.
 *
 * We probably need it for UNORM_SIMPLE_APPEND.
 */
// Inserts c somewhere before the last character.
// Requires 0<cc<lastCC which implies reorderStart<limit.
private void insert(final int c, final int cc) {
// Scan backward past all characters with a larger combining class.
for (setIterator(), skipPrevious(); previousCC() > cc;) {
}
// insert c at codePointLimit, after the character with prevCC<=cc
if (c <= 0xffff) {
str.insert(codePointLimit, (char) c);
if (cc <= 1) {
reorderStart = codePointLimit + 1;
}
} else {
str.insert(codePointLimit, Character.toChars(c));
if (cc <= 1) {
reorderStart = codePointLimit + 2;
}
}
}
private final Normalizer2Impl impl; // combining-class lookups via getNorm16()
private final Appendable app; // final destination
private final StringBuilder str; // working buffer (== app when appIsStringBuilder)
private final boolean appIsStringBuilder;
private int reorderStart; // index after the last code point known to have cc<=1
private int lastCC; // ccc tracked for the last code point in str
// private backward iterator
private void setIterator() { // Start iterating backward from the end of the buffer.
codePointStart = str.length();
}
private void skipPrevious() { // Requires 0<codePointStart.
codePointLimit = codePointStart;
codePointStart = str.offsetByCodePoints(codePointStart, -1);
}
private int previousCC() { // Returns 0 if there is no previous character.
codePointLimit = codePointStart;
if (reorderStart >= codePointStart) {
return 0;
}
int c = str.codePointBefore(codePointStart);
codePointStart -= Character.charCount(c);
if (c < MIN_CCC_LCCC_CP) {
// Below U+0300 all characters have ccc==0; no data lookup needed.
return 0;
}
return getCCFromYesOrMaybe(impl.getNorm16(c));
}
private int codePointStart, codePointLimit; // bounds of the backward iterator's current code point
}
// TODO: Propose as public API on the UTF16 class.
// TODO: Propose widening UTF16 methods that take char to take int.
// TODO: Propose widening UTF16 methods that take String to take CharSequence.
public static final class UTF16Plus {
    /**
     * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), is it a lead surrogate?
     *
     * @param c
     *            code unit or code point
     * @return true or false
     */
    public static boolean isSurrogateLead(final int c) {
        // Trail surrogates (U+DC00..U+DFFF) have bit 10 (0x400) set; lead surrogates do not.
        return 0 == (c & 0x400);
    }

    /**
     * Compares two CharSequence objects for binary equality.
     *
     * @param s1
     *            first sequence
     * @param s2
     *            second sequence
     * @return true if s1 contains the same text as s2
     */
    public static boolean equal(final CharSequence s1, final CharSequence s2) {
        if (s1 == s2) {
            return true; // same object, trivially equal
        }
        final int len = s1.length();
        if (s2.length() != len) {
            return false; // different lengths can never match
        }
        int i = 0;
        while (i < len) {
            if (s1.charAt(i) != s2.charAt(i)) {
                return false;
            }
            ++i;
        }
        return true;
    }

    /**
     * Compares two CharSequence subsequences for binary equality.
     *
     * @param s1
     *            first sequence
     * @param start1
     *            start offset in first sequence
     * @param limit1
     *            limit offset in first sequence
     * @param s2
     *            second sequence
     * @param start2
     *            start offset in second sequence
     * @param limit2
     *            limit offset in second sequence
     * @return true if s1.subSequence(start1, limit1) contains the same text as s2.subSequence(start2, limit2)
     */
    public static boolean equal(final CharSequence s1, int start1, final int limit1, final CharSequence s2, int start2, final int limit2) {
        if ((limit1 - start1) != (limit2 - start2)) {
            return false; // different lengths can never match
        }
        if (s1 == s2 && start1 == start2) {
            return true; // identical object and identical range
        }
        for (; start1 < limit1; ++start1, ++start2) {
            if (s1.charAt(start1) != s2.charAt(start2)) {
                return false;
            }
        }
        return true;
    }
}
/** Creates an uninitialized instance; the data fields are populated by load(). */
public Normalizer2Impl() {
}
/** Accepts only data files with binary format major version 2. */
private static final class IsAcceptable implements ICUBinary.Authenticate {
// @Override when we switch to Java 6
public boolean isDataVersionAcceptable(final byte version[]) {
return version[0] == 2;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
// Data format identifier expected in the file header.
private static final byte DATA_FORMAT[] = { 0x4e, 0x72, 0x6d, 0x32 }; // "Nrm2"
/**
 * Reads binary Normalizer2 data ("Nrm2" format) from the stream, initializes
 * this instance's data fields, and closes the stream on success.
 *
 * @param data stream positioned at the start of a Normalizer2 data file
 * @return this
 * @throws RuntimeException wrapping an IOException on read or format errors
 */
public Normalizer2Impl load(final InputStream data) {
try {
BufferedInputStream bis = new BufferedInputStream(data);
dataVersion = ICUBinary.readHeaderAndDataVersion(bis, DATA_FORMAT, IS_ACCEPTABLE);
DataInputStream ds = new DataInputStream(bis);
int indexesLength = ds.readInt() / 4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
if (indexesLength <= IX_MIN_MAYBE_YES) {
throw new IOException("Normalizer2 data: not enough indexes");
}
int[] inIndexes = new int[indexesLength];
inIndexes[0] = indexesLength * 4;
for (int i = 1; i < indexesLength; ++i) {
inIndexes[i] = ds.readInt();
}
// Code point and norm16 thresholds for the quick checks.
minDecompNoCP = inIndexes[IX_MIN_DECOMP_NO_CP];
minCompNoMaybeCP = inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo = inIndexes[IX_MIN_YES_NO];
minYesNoMappingsOnly = inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
minNoNo = inIndexes[IX_MIN_NO_NO];
limitNoNo = inIndexes[IX_LIMIT_NO_NO];
minMaybeYes = inIndexes[IX_MIN_MAYBE_YES];
// Read the normTrie.
int offset = inIndexes[IX_NORM_TRIE_OFFSET];
int nextOffset = inIndexes[IX_EXTRA_DATA_OFFSET];
normTrie = Trie2_16.createFromSerialized(ds);
int trieLength = normTrie.getSerializedLength();
if (trieLength > (nextOffset - offset)) {
throw new IOException("Normalizer2 data: not enough bytes for normTrie");
}
ds.skipBytes((nextOffset - offset) - trieLength); // skip padding after trie bytes
// Read the composition and mapping data.
offset = nextOffset;
nextOffset = inIndexes[IX_SMALL_FCD_OFFSET];
int numChars = (nextOffset - offset) / 2;
char[] chars;
if (numChars != 0) {
chars = new char[numChars];
for (int i = 0; i < numChars; ++i) {
chars[i] = ds.readChar();
}
maybeYesCompositions = new String(chars);
// extraData starts after the maybeYes compositions; both share the same char array.
extraData = maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES - minMaybeYes);
}
// smallFCD: new in formatVersion 2
offset = nextOffset;
smallFCD = new byte[0x100];
for (int i = 0; i < 0x100; ++i) {
smallFCD[i] = ds.readByte();
}
// Build tccc180[].
// gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
tccc180 = new int[0x180];
int bits = 0;
// Each smallFCD byte covers 0x100 code points, one bit per 0x20-code-point chunk;
// expand only the flagged chunks via full data lookups.
for (int c = 0; c < 0x180; bits >>= 1) {
if ((c & 0xff) == 0) {
bits = smallFCD[c >> 8]; // one byte per 0x100 code points
}
if ((bits & 1) != 0) {
for (int i = 0; i < 0x20; ++i, ++c) {
tccc180[c] = getFCD16FromNormData(c) & 0xff;
}
} else {
c += 0x20;
}
}
// NOTE(review): the stream is closed only on success; an exception above leaves it open.
data.close();
return this;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/** Loads the named ICU data resource and delegates to {@link #load(InputStream)}. */
public Normalizer2Impl load(final String name) {
    final InputStream stream = ICUData.getRequiredStream(name);
    return load(stream);
}
/**
 * Adds to the set the start code point of each same-value range of the normTrie,
 * plus Hangul LV-syllable boundaries, for property enumeration.
 */
public void addPropertyStarts(final UnicodeSet set) {
/* add the start code point of each same-value range of each trie */
Iterator<Trie2.Range> trieIterator = normTrie.iterator();
Trie2.Range range;
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
/* add the start code point to the USet */
set.add(range.startCodePoint);
}
/* add Hangul LV syllables and LV+1 because of skippables */
for (int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT; c += Hangul.JAMO_T_COUNT) {
set.add(c);
set.add(c + 1);
}
set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}
/**
 * Adds to the set the start code point of each same-value range of the
 * canonical-iterator data trie, mapped down to the segment-starter bit.
 */
public void addCanonIterPropertyStarts(final UnicodeSet set) {
/* add the start code point of each same-value range of the canonical iterator data trie */
ensureCanonIterData();
// currently only used for the SEGMENT_STARTER property
Iterator<Trie2.Range> trieIterator = canonIterData.iterator(segmentStarterMapper);
Trie2.Range range;
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
/* add the start code point to the USet */
set.add(range.startCodePoint);
}
}
// Collapses each trie value to its CANON_NOT_SEGMENT_STARTER bit so that trie
// iteration merges ranges that differ only in other bits.
private static final Trie2.ValueMapper segmentStarterMapper = new Trie2.ValueMapper() {
public int map(final int in) {
return in & CANON_NOT_SEGMENT_STARTER;
}
};
// low-level properties ------------------------------------------------ ***
/** Returns the main normalization trie, which maps code points to norm16 values. */
public Trie2_16 getNormTrie() {
return normTrie;
}
// Note: Normalizer2Impl.java r30983 (2011-nov-27)
// still had getFCDTrie() which built and cached an FCD trie.
// That provided faster access to FCD data than getFCD16FromNormData()
// but required synchronization and consumed some 10kB of heap memory
// in any process that uses FCD (e.g., via collation).
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
// at least for Latin & CJK.
/**
* Builds the canonical-iterator data for this instance. This is required before any of {@link #isCanonSegmentStarter(int)} or
* {@link #getCanonStartSet(int, UnicodeSet)} are called, or else they crash.
*
* @return this
*/
public synchronized Normalizer2Impl ensureCanonIterData() {
// Lazily build canonIterData once; synchronized guards the build against concurrent callers.
if (canonIterData == null) {
Trie2Writable newData = new Trie2Writable(0, 0);
canonStartSets = new ArrayList<UnicodeSet>();
Iterator<Trie2.Range> trieIterator = normTrie.iterator();
Trie2.Range range;
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
final int norm16 = range.value;
if (norm16 == 0 || (minYesNo <= norm16 && norm16 < minNoNo)) {
// Inert, or 2-way mapping (including Hangul syllable).
// We do not write a canonStartSet for any yesNo character.
// Composites from 2-way mappings are added at runtime from the
// starter's compositions list, and the other characters in
// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
// "maybe" characters.
continue;
}
for (int c = range.startCodePoint; c <= range.endCodePoint; ++c) {
final int oldValue = newData.get(c);
int newValue = oldValue;
if (norm16 >= minMaybeYes) {
// not a segment starter if it occurs in a decomposition or has cc!=0
newValue |= CANON_NOT_SEGMENT_STARTER;
if (norm16 < MIN_NORMAL_MAYBE_YES) {
newValue |= CANON_HAS_COMPOSITIONS;
}
} else if (norm16 < minYesNo) {
newValue |= CANON_HAS_COMPOSITIONS;
} else {
// c has a one-way decomposition
int c2 = c;
int norm16_2 = norm16;
// Follow algorithmic mappings to the final mapping target.
while (limitNoNo <= norm16_2 && norm16_2 < minMaybeYes) {
c2 = this.mapAlgorithmic(c2, norm16_2);
norm16_2 = getNorm16(c2);
}
if (minYesNo <= norm16_2 && norm16_2 < limitNoNo) {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16_2);
int length = firstUnit & MAPPING_LENGTH_MASK;
if ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) != 0) {
if (c == c2 && (extraData.charAt(norm16_2 - 1) & 0xff) != 0) {
newValue |= CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
}
}
// Skip empty mappings (no characters in the decomposition).
if (length != 0) {
++norm16_2; // skip over the firstUnit
// add c to first code point's start set
int limit = norm16_2 + length;
c2 = extraData.codePointAt(norm16_2);
addToStartSet(newData, c, c2);
// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
// one-way mapping. A 2-way mapping is possible here after
// intermediate algorithmic mapping.
if (norm16_2 >= minNoNo) {
while ((norm16_2 += Character.charCount(c2)) < limit) {
c2 = extraData.codePointAt(norm16_2);
int c2Value = newData.get(c2);
if ((c2Value & CANON_NOT_SEGMENT_STARTER) == 0) {
newData.set(c2, c2Value | CANON_NOT_SEGMENT_STARTER);
}
}
}
}
} else {
// c decomposed to c2 algorithmically; c has cc==0
addToStartSet(newData, c, c2);
}
}
if (newValue != oldValue) {
newData.set(c, newValue);
}
}
}
// Freeze the writable trie into the read-only runtime form.
canonIterData = newData.toTrie2_32();
}
return this;
}
/** Returns the norm16 properties value for code point c from the normalization trie. */
public int getNorm16(final int c) {
return normTrie.get(c);
}
/** Maps a norm16 value to a composition quick-check result: 1=yes, 2=maybe, 0=no. */
public int getCompQuickCheck(final int norm16) {
    if (minNoNo <= norm16 && norm16 < MIN_YES_YES_WITH_CC) {
        // Inside the no/maybe band: "maybe" at/above minMaybeYes, otherwise "no".
        return norm16 < minMaybeYes ? 0 : 2;
    }
    return 1; // yes
}
/** Returns true if norm16 is a composition "no" value, i.e. in [minNoNo, minMaybeYes). */
public boolean isCompNo(final int norm16) {
    return !(norm16 < minNoNo || minMaybeYes <= norm16);
}
/** Returns true if norm16 is a decomposition "yes" value (outside [minYesNo, minMaybeYes)). */
public boolean isDecompYes(final int norm16) {
    final boolean inNoRange = minYesNo <= norm16 && norm16 < minMaybeYes;
    return !inNoRange;
}
/** Returns the canonical combining class (ccc) encoded by the norm16 value. */
public int getCC(final int norm16) {
    // "Normal maybe-yes" values carry the ccc directly in the low byte.
    if (MIN_NORMAL_MAYBE_YES <= norm16) {
        return norm16 & 0xff;
    }
    // Values outside the noNo band have ccc==0.
    if (norm16 < minNoNo || norm16 >= limitNoNo) {
        return 0;
    }
    return getCCFromNoNo(norm16);
}
/** Returns the ccc for a yes/maybe norm16 value; 0 unless the value encodes a ccc in its low byte. */
public static int getCCFromYesOrMaybe(final int norm16) {
    if (norm16 < MIN_NORMAL_MAYBE_YES) {
        return 0;
    }
    return norm16 & 0xff;
}
/**
* Returns the FCD data for code point c.
*
* @param c
* A Unicode code point.
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
public int getFCD16(final int c) {
    if (c < 0) {
        return 0; // not a code point
    }
    if (c < 0x180) {
        return tccc180[c]; // precomputed fast table for Latin
    }
    if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
        return 0; // BMP character whose smallFCD bit is clear
    }
    return getFCD16FromNormData(c);
}
/** Returns the FCD data for U+0000<=c<U+0180. */
// Only tccc is stored: lccc==0 for these code points (see the tccc180[] build in load()).
public int getFCD16FromBelow180(final int c) {
return tccc180[c];
}
/** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
public boolean singleLeadMightHaveNonZeroFCD16(final int lead) {
    // 0<=lead<=0xffff; one smallFCD byte covers 0x100 code points,
    // with one bit per 0x20-code-point chunk.
    final byte bits = smallFCD[lead >> 8];
    if (bits == 0) {
        return false;
    }
    final int chunk = (lead >> 5) & 7;
    return ((bits >> chunk) & 1) != 0;
}
/** Gets the FCD value from the regular normalization data. */
public int getFCD16FromNormData(int c) {
// Only loops for 1:1 algorithmic mappings.
for (;;) {
int norm16 = getNorm16(c);
if (norm16 <= minYesNo) {
// no decomposition or Hangul syllable, all zeros
return 0;
} else if (norm16 >= MIN_NORMAL_MAYBE_YES) {
// combining mark
norm16 &= 0xff;
// lccc==tccc==ccc for a combining mark.
return norm16 | (norm16 << 8);
} else if (norm16 >= minMaybeYes) {
return 0;
} else if (isDecompNoAlgorithmic(norm16)) {
// Follow the algorithmic mapping and retry.
c = mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16);
if ((firstUnit & MAPPING_LENGTH_MASK) == 0) {
// A character that is deleted (maps to an empty string) must
// get the worst-case lccc and tccc values because arbitrary
// characters on both sides will become adjacent.
return 0x1ff;
} else {
int fcd16 = firstUnit >> 8; // tccc
if ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) != 0) {
// The optional word before the firstUnit carries lccc in its high byte.
fcd16 |= extraData.charAt(norm16 - 1) & 0xff00; // lccc
}
return fcd16;
}
}
}
}
/**
* Gets the decomposition for one code point.
*
* @param c
* code point
* @return c's decomposition, if it has one; returns null if it does not have a decomposition
*/
public String getDecomposition(int c) {
// decomp stays <0 unless c was mapped algorithmically; the loop only repeats
// for algorithmic (1:1) mappings.
int decomp = -1;
int norm16;
for (;;) {
if (c < minDecompNoCP || isDecompYes(norm16 = getNorm16(c))) {
// c does not decompose
} else if (isHangul(norm16)) {
// Hangul syllable: decompose algorithmically
StringBuilder buffer = new StringBuilder();
Hangul.decompose(c, buffer);
return buffer.toString();
} else if (isDecompNoAlgorithmic(norm16)) {
decomp = c = mapAlgorithmic(c, norm16);
continue;
} else {
// c decomposes, get everything from the variable-length extra data
int length = extraData.charAt(norm16++) & MAPPING_LENGTH_MASK;
return extraData.substring(norm16, norm16 + length);
}
if (decomp < 0) {
// Original c had no decomposition at all.
return null;
} else {
// The algorithmic mapping target is itself the decomposition.
return UTF16.valueOf(decomp);
}
}
}
/**
* Gets the raw decomposition for one code point.
*
* @param c
* code point
* @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
*/
public String getRawDecomposition(final int c) {
// We do not loop in this method because an algorithmic mapping itself
// becomes a final result rather than having to be decomposed recursively.
int norm16;
if (c < minDecompNoCP || isDecompYes(norm16 = getNorm16(c))) {
// c does not decompose
return null;
} else if (isHangul(norm16)) {
// Hangul syllable: decompose algorithmically
StringBuilder buffer = new StringBuilder();
Hangul.getRawDecomposition(c, buffer);
return buffer.toString();
} else if (isDecompNoAlgorithmic(norm16)) {
return UTF16.valueOf(mapAlgorithmic(c, norm16));
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16);
int mLength = firstUnit & MAPPING_LENGTH_MASK; // length of normal mapping
if ((firstUnit & MAPPING_HAS_RAW_MAPPING) != 0) {
// Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
// Bit 7=MAPPING_HAS_CCC_LCCC_WORD
int rawMapping = norm16 - ((firstUnit >> 7) & 1) - 1;
char rm0 = extraData.charAt(rawMapping);
if (rm0 <= MAPPING_LENGTH_MASK) {
// rm0 is the raw mapping's length; the mapping precedes it.
return extraData.substring(rawMapping - rm0, rawMapping);
} else {
// Copy the normal mapping and replace its first two code units with rm0.
StringBuilder buffer = new StringBuilder(mLength - 1).append(rm0);
norm16 += 1 + 2; // skip over the firstUnit and the first two mapping code units
return buffer.append(extraData, norm16, norm16 + mLength - 2).toString();
}
} else {
norm16 += 1; // skip over the firstUnit
return extraData.substring(norm16, norm16 + mLength);
}
}
}
/**
* Returns true if code point c starts a canonical-iterator string segment. <b>{@link #ensureCanonIterData()} must have been called
* before this method, or else this method will crash.</b>
*
* @param c
* A Unicode code point.
* @return true if c starts a canonical-iterator string segment.
*/
public boolean isCanonSegmentStarter(final int c) {
// Non-starters are marked with a negative canonIterData value.
return canonIterData.get(c) >= 0;
}
/**
* Returns true if there are characters whose decomposition starts with c. If so, then the set is cleared and then filled with those
* characters. <b>{@link #ensureCanonIterData()} must have been called before this method, or else this method will crash.</b>
*
* @param c
* A Unicode code point.
* @param set
* A UnicodeSet to receive the characters whose decompositions start with c, if there are any.
* @return true if there are characters whose decomposition starts with c.
*/
public boolean getCanonStartSet(final int c, final UnicodeSet set) {
// Ignore the segment-starter bit; any remaining bits mean there is data for c.
int canonValue = canonIterData.get(c) & ~CANON_NOT_SEGMENT_STARTER;
if (canonValue == 0) {
return false;
}
set.clear();
int value = canonValue & CANON_VALUE_MASK;
if ((canonValue & CANON_HAS_SET) != 0) {
// value indexes a precomputed set of characters whose decompositions start with c.
set.addAll(canonStartSets.get(value));
} else if (value != 0) {
// value is the single such character.
set.add(value);
}
if ((canonValue & CANON_HAS_COMPOSITIONS) != 0) {
int norm16 = getNorm16(c);
if (norm16 == JAMO_L) {
// A Jamo L composes into the whole block of syllables that start with it.
int syllable = Hangul.HANGUL_BASE + (c - Hangul.JAMO_L_BASE) * Hangul.JAMO_VT_COUNT;
set.add(syllable, syllable + Hangul.JAMO_VT_COUNT - 1);
} else {
addComposites(getCompositionsList(norm16), set);
}
}
return true;
}
// Below this code point, ccc==0 and lccc==0 (enforced by gennorm2; see previousCC()).
public static final int MIN_CCC_LCCC_CP = 0x300;
// Fixed norm16 values defined by the data format.
public static final int MIN_YES_YES_WITH_CC = 0xff01;
public static final int JAMO_VT = 0xff00; // norm16 of Jamo Vowel/Trailing consonants
// At/above this value, norm16 carries the ccc directly in its low byte.
public static final int MIN_NORMAL_MAYBE_YES = 0xfe00;
public static final int JAMO_L = 1; // norm16 of Jamo Leading consonants
public static final int MAX_DELTA = 0x40;
// Byte offsets from the start of the data, after the generic header.
public static final int IX_NORM_TRIE_OFFSET = 0;
public static final int IX_EXTRA_DATA_OFFSET = 1;
public static final int IX_SMALL_FCD_OFFSET = 2;
public static final int IX_RESERVED3_OFFSET = 3;
public static final int IX_TOTAL_SIZE = 7;
// Code point thresholds for quick check codes.
public static final int IX_MIN_DECOMP_NO_CP = 8;
public static final int IX_MIN_COMP_NO_MAYBE_CP = 9;
// Norm16 value thresholds for quick check combinations and types of extra data.
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
public static final int IX_MIN_YES_NO = 10;
public static final int IX_MIN_NO_NO = 11;
public static final int IX_LIMIT_NO_NO = 12;
public static final int IX_MIN_MAYBE_YES = 13;
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY = 14;
public static final int IX_COUNT = 16;
// Flag and mask bits of the first unit of a mapping in extraData.
public static final int MAPPING_HAS_CCC_LCCC_WORD = 0x80;
public static final int MAPPING_HAS_RAW_MAPPING = 0x40;
public static final int MAPPING_NO_COMP_BOUNDARY_AFTER = 0x20;
public static final int MAPPING_LENGTH_MASK = 0x1f;
// Bit layout of composition-list tuples.
public static final int COMP_1_LAST_TUPLE = 0x8000;
public static final int COMP_1_TRIPLE = 1;
public static final int COMP_1_TRAIL_LIMIT = 0x3400;
public static final int COMP_1_TRAIL_MASK = 0x7ffe;
public static final int COMP_1_TRAIL_SHIFT = 9; // 10-1 for the "triple" bit
public static final int COMP_2_TRAIL_SHIFT = 6;
public static final int COMP_2_TRAIL_MASK = 0xffc0;
// higher-level functionality ------------------------------------------ ***
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
/**
 * Decomposes s[src..limit[.
 * With buffer!=null, writes the decomposition to the buffer and returns limit.
 * With buffer==null, quick-checks and returns the end of the prefix that is
 * already decomposed and canonically ordered (spanQuickCheckYes).
 */
public int decompose(final CharSequence s, int src, final int limit, final ReorderingBuffer buffer) {
int minNoCP = minDecompNoCP;
int prevSrc;
int c = 0;
int norm16 = 0;
// only for quick check
int prevBoundary = src;
int prevCC = 0;
for (;;) {
// count code units below the minimum or with irrelevant data for the quick check
for (prevSrc = src; src != limit;) {
if ((c = s.charAt(src)) < minNoCP || isMostDecompYesAndZeroCC(norm16 = normTrie.getFromU16SingleLead((char) c))) {
++src;
} else if (!UTF16.isSurrogate((char) c)) {
break;
} else {
// Assemble a supplementary code point from a valid surrogate pair;
// unpaired surrogates fall through with c as the single code unit.
char c2;
if (UTF16Plus.isSurrogateLead(c)) {
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char) c, c2);
}
} else /* trail surrogate */{
if (prevSrc < src && Character.isHighSurrogate(c2 = s.charAt(src - 1))) {
--src;
c = Character.toCodePoint(c2, (char) c);
}
}
if (isMostDecompYesAndZeroCC(norm16 = getNorm16(c))) {
src += Character.charCount(c);
} else {
break;
}
}
}
// copy these code units all at once
if (src != prevSrc) {
if (buffer != null) {
buffer.flushAndAppendZeroCC(s, prevSrc, src);
} else {
prevCC = 0;
prevBoundary = src;
}
}
if (src == limit) {
break;
}
// Check one above-minimum, relevant code point.
src += Character.charCount(c);
if (buffer != null) {
decompose(c, norm16, buffer);
} else {
// Quick check: verify c is decomposition-"yes" and in canonical order.
if (isDecompYes(norm16)) {
int cc = getCCFromYesOrMaybe(norm16);
if (prevCC <= cc || cc == 0) {
prevCC = cc;
if (cc <= 1) {
prevBoundary = src;
}
continue;
}
}
return prevBoundary; // "no" or cc out of order
}
}
return src;
}
/**
 * Appends s to the buffer: either its decomposition (doDecompose==true), or s as-is
 * with only the canonical ordering merged at the boundary between the buffer's
 * current contents and the new text.
 *
 * @param s input text
 * @param doDecompose true to decompose s before appending
 * @param buffer receives the output
 */
public void decomposeAndAppend(final CharSequence s, final boolean doDecompose, final ReorderingBuffer buffer) {
    final int limit = s.length();
    if (limit == 0) {
        return;
    }
    if (doDecompose) {
        decompose(s, 0, limit, buffer);
        return;
    }
    // Just merge the strings at the boundary:
    // measure the leading run of characters with cc!=0 so that buffer.append()
    // can canonically order them relative to the buffer's trailing characters.
    int c = Character.codePointAt(s, 0);
    int src = 0;
    int firstCC, prevCC, cc;
    firstCC = prevCC = cc = getCC(getNorm16(c));
    while (cc != 0) {
        prevCC = cc;
        src += Character.charCount(c);
        if (src >= limit) {
            break;
        }
        c = Character.codePointAt(s, src);
        cc = getCC(getNorm16(c));
    }
    buffer.append(s, 0, src, firstCC, prevCC); // leading combining marks, reordered as needed
    buffer.append(s, src, limit); // the rest begins at a cc==0 boundary
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
/**
 * Composes the text in [src..limit[ of s.
 * Dual functionality:
 * doCompose: writes the composed (NFC/NFKC/FCC) form into buffer and returns true.
 * !doCompose: isNormalized check; buffer must be empty and initialized, and the
 * method returns false as soon as the text is determined not to be normalized.
 *
 * @param s source text
 * @param src start index (inclusive) of the span to process
 * @param limit end index (exclusive) of the span to process
 * @param onlyContiguous true for FCC (no discontiguous composition)
 * @param doCompose true to normalize, false to only test isNormalized
 * @param buffer destination (doCompose) or scratch (!doCompose) buffer
 * @return true if doCompose, else whether [src..limit[ is normalized
 */
public boolean compose(final CharSequence s, int src, final int limit, final boolean onlyContiguous, final boolean doCompose,
final ReorderingBuffer buffer) {
int minNoMaybeCP = minCompNoMaybeCP;
/*
 * prevBoundary points to the last character before the current one
 * that has a composition boundary before it with ccc==0 and quick check "yes".
 * Keeping track of prevBoundary saves us looking for a composition boundary
 * when we find a "no" or "maybe".
 *
 * When we back out from prevSrc back to prevBoundary,
 * then we also remove those same characters (which had been simply copied
 * or canonically-order-inserted) from the ReorderingBuffer.
 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
 * must correspond 1:1 to destination units at the end of the destination buffer.
 */
int prevBoundary = src;
int prevSrc;
int c = 0;
int norm16 = 0;
// only for isNormalized
int prevCC = 0;
for (;;) {
// count code units below the minimum or with irrelevant data for the quick check
for (prevSrc = src; src != limit;) {
if ((c = s.charAt(src)) < minNoMaybeCP || isCompYesAndZeroCC(norm16 = normTrie.getFromU16SingleLead((char) c))) {
++src;
} else if (!UTF16.isSurrogate((char) c)) {
break;
} else {
// Assemble a supplementary code point from a surrogate pair;
// an unpaired surrogate is processed as-is.
char c2;
if (UTF16Plus.isSurrogateLead(c)) {
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char) c, c2);
}
} else /* trail surrogate */{
if (prevSrc < src && Character.isHighSurrogate(c2 = s.charAt(src - 1))) {
--src;
c = Character.toCodePoint(c2, (char) c);
}
}
if (isCompYesAndZeroCC(norm16 = getNorm16(c))) {
src += Character.charCount(c);
} else {
break;
}
}
}
// copy these code units all at once
if (src != prevSrc) {
if (src == limit) {
if (doCompose) {
buffer.flushAndAppendZeroCC(s, prevSrc, src);
}
break;
}
// Set prevBoundary to the last character in the quick check loop.
prevBoundary = src - 1;
if (Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc < prevBoundary
&& Character.isHighSurrogate(s.charAt(prevBoundary - 1))) {
--prevBoundary;
}
if (doCompose) {
// The last "quick check yes" character is excluded from the
// flush-and-append call in case it needs to be modified.
buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
buffer.append(s, prevBoundary, src);
} else {
prevCC = 0;
}
// The start of the current character (c).
prevSrc = src;
} else if (src == limit) {
break;
}
src += Character.charCount(c);
/*
 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
 * or has ccc!=0.
 * Check for Jamo V/T, then for regular characters.
 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
 */
if (isJamoVT(norm16) && prevBoundary != prevSrc) {
char prev = s.charAt(prevSrc - 1);
boolean needToDecompose = false;
if (c < Hangul.JAMO_T_BASE) {
// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
prev -= Hangul.JAMO_L_BASE;
if (prev < Hangul.JAMO_L_COUNT) {
if (!doCompose) {
return false;
}
char syllable = (char) (Hangul.HANGUL_BASE + (prev * Hangul.JAMO_V_COUNT + (c - Hangul.JAMO_V_BASE))
* Hangul.JAMO_T_COUNT);
char t;
if (src != limit && (t = (char) (s.charAt(src) - Hangul.JAMO_T_BASE)) < Hangul.JAMO_T_COUNT) {
++src;
syllable += t; // The next character was a Jamo T.
prevBoundary = src;
buffer.setLastChar(syllable);
continue;
}
// If we see L+V+x where x!=T then we drop to the slow path,
// decompose and recompose.
// This is to deal with NFKC finding normal L and V but a
// compatibility variant of a T. We need to either fully compose that
// combination here (which would complicate the code and may not work
// with strange custom data) or use the slow path -- or else our replacing
// two input characters (L+V) with one output character (LV syllable)
// would violate the invariant that [prevBoundary..prevSrc[ has the same
// length as what we appended to the buffer since prevBoundary.
needToDecompose = true;
}
} else if (Hangul.isHangulWithoutJamoT(prev)) {
// c is a Jamo Trailing consonant,
// compose with previous Hangul LV that does not contain a Jamo T.
if (!doCompose) {
return false;
}
buffer.setLastChar((char) (prev + c - Hangul.JAMO_T_BASE));
prevBoundary = src;
continue;
}
if (!needToDecompose) {
// The Jamo V/T did not compose into a Hangul syllable.
if (doCompose) {
buffer.append((char) c);
} else {
prevCC = 0;
}
continue;
}
}
/*
 * Source buffer pointers:
 *
 * all done quick check current char not yet
 * "yes" but (c) processed
 * may combine
 * forward
 * [-------------[-------------[-------------[-------------[
 * | | | | |
 * orig. src prevBoundary prevSrc src limit
 *
 *
 * Destination buffer pointers inside the ReorderingBuffer:
 *
 * all done might take not filled yet
 * characters for
 * reordering
 * [-------------[-------------[-------------[
 * | | | |
 * start reorderStart limit |
 * +remainingCap.+
 */
if (norm16 >= MIN_YES_YES_WITH_CC) {
int cc = norm16 & 0xff; // cc!=0
if (onlyContiguous && // FCC
(doCompose ? buffer.getLastCC() : prevCC) == 0 && prevBoundary < prevSrc &&
// buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
// passed the quick check "yes && ccc==0" test.
// Check whether the last character was a "yesYes" or a "yesNo".
// If a "yesNo", then we get its trailing ccc from its
// mapping and check for canonical order.
// All other cases are ok.
getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc) > cc) {
// Fails FCD test, need to decompose and contiguously recompose.
if (!doCompose) {
return false;
}
} else if (doCompose) {
buffer.append(c, cc);
continue;
} else if (prevCC <= cc) {
prevCC = cc;
continue;
} else {
return false;
}
} else if (!doCompose && !isMaybeOrNonZeroCC(norm16)) {
return false;
}
/*
 * Find appropriate boundaries around this character,
 * decompose the source text from between the boundaries,
 * and recompose it.
 *
 * We may need to remove the last few characters from the ReorderingBuffer
 * to account for source text that was copied or appended
 * but needs to take part in the recomposition.
 */
/*
 * Find the last composition boundary in [prevBoundary..src[.
 * It is either the decomposition of the current character (at prevSrc),
 * or prevBoundary.
 */
if (hasCompBoundaryBefore(c, norm16)) {
prevBoundary = prevSrc;
} else if (doCompose) {
buffer.removeSuffix(prevSrc - prevBoundary);
}
// Find the next composition boundary in [src..limit[ -
// modifies src to point to the next starter.
src = findNextCompBoundary(s, src, limit);
// Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
int recomposeStartIndex = buffer.length();
decomposeShort(s, prevBoundary, src, buffer);
recompose(buffer, recomposeStartIndex, onlyContiguous);
if (!doCompose) {
if (!buffer.equals(s, prevBoundary, src)) {
return false;
}
buffer.remove();
prevCC = 0;
}
// Move to the next starter. We never need to look back before this point again.
prevBoundary = src;
}
return true;
}
/**
 * Very similar to compose(): Make the same changes in both places if relevant.
 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value).
 * !doSpan: quickCheck.
 *
 * @param s source text
 * @param src start index (inclusive) of the span to check
 * @param limit end index (exclusive) of the span to check
 * @param onlyContiguous true for FCC (no discontiguous composition)
 * @param doSpan true for spanQuickCheckYes, false for quickCheck
 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
 *         then the quick check result is "no"
 */
public int composeQuickCheck(final CharSequence s, int src, final int limit, final boolean onlyContiguous, final boolean doSpan) {
int qcResult = 0;
int minNoMaybeCP = minCompNoMaybeCP;
/*
 * prevBoundary points to the last character before the current one
 * that has a composition boundary before it with ccc==0 and quick check "yes".
 */
int prevBoundary = src;
int prevSrc;
int c = 0;
int norm16 = 0;
int prevCC = 0;
for (;;) {
// count code units below the minimum or with irrelevant data for the quick check
for (prevSrc = src;;) {
if (src == limit) {
return (src << 1) | qcResult; // "yes" or "maybe"
}
if ((c = s.charAt(src)) < minNoMaybeCP || isCompYesAndZeroCC(norm16 = normTrie.getFromU16SingleLead((char) c))) {
++src;
} else if (!UTF16.isSurrogate((char) c)) {
break;
} else {
// Assemble a supplementary code point from a surrogate pair;
// an unpaired surrogate is processed as-is.
char c2;
if (UTF16Plus.isSurrogateLead(c)) {
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char) c, c2);
}
} else /* trail surrogate */{
if (prevSrc < src && Character.isHighSurrogate(c2 = s.charAt(src - 1))) {
--src;
c = Character.toCodePoint(c2, (char) c);
}
}
if (isCompYesAndZeroCC(norm16 = getNorm16(c))) {
src += Character.charCount(c);
} else {
break;
}
}
}
if (src != prevSrc) {
// Set prevBoundary to the last character in the quick check loop.
prevBoundary = src - 1;
if (Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc < prevBoundary
&& Character.isHighSurrogate(s.charAt(prevBoundary - 1))) {
--prevBoundary;
}
prevCC = 0;
// The start of the current character (c).
prevSrc = src;
}
src += Character.charCount(c);
/*
 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
 * or has ccc!=0.
 */
if (isMaybeOrNonZeroCC(norm16)) {
int cc = getCCFromYesOrMaybe(norm16);
if (onlyContiguous && // FCC
cc != 0 && prevCC == 0 && prevBoundary < prevSrc &&
// prevCC==0 && prevBoundary<prevSrc tell us that
// [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
// passed the quick check "yes && ccc==0" test.
// Check whether the last character was a "yesYes" or a "yesNo".
// If a "yesNo", then we get its trailing ccc from its
// mapping and check for canonical order.
// All other cases are ok.
getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc) > cc) {
// Fails FCD test.
} else if (prevCC <= cc || cc == 0) {
prevCC = cc;
if (norm16 < MIN_YES_YES_WITH_CC) {
if (!doSpan) {
qcResult = 1;
} else {
return prevBoundary << 1; // spanYes does not care to know it's "maybe"
}
}
continue;
}
}
return prevBoundary << 1; // "no"
}
}
/**
 * Appends s to the buffer, composing across the seam between the existing
 * buffer contents and the new text when doCompose is true.
 * If the buffer already ends with text that may combine with the start of s,
 * the span between the last composition boundary in the buffer and the first
 * one in s is recomposed first.
 */
public void composeAndAppend(final CharSequence s, final boolean doCompose, final boolean onlyContiguous, final ReorderingBuffer buffer) {
    int start = 0;
    final int end = s.length();
    if (!buffer.isEmpty()) {
        final int srcBoundary = findNextCompBoundary(s, 0, end);
        if (srcBoundary != 0) {
            // Stitch the destination tail and the source head together and recompose them.
            final int destBoundary = findPreviousCompBoundary(buffer.getStringBuilder(), buffer.length());
            final StringBuilder seam = new StringBuilder((buffer.length() - destBoundary) + srcBoundary + 16);
            seam.append(buffer.getStringBuilder(), destBoundary, buffer.length());
            buffer.removeSuffix(buffer.length() - destBoundary);
            seam.append(s, 0, srcBoundary);
            compose(seam, 0, seam.length(), onlyContiguous, true, buffer);
            start = srcBoundary;
        }
    }
    if (doCompose) {
        compose(s, start, end, onlyContiguous, true, buffer);
    } else {
        buffer.append(s, start, end);
    }
}
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
/**
 * Makes the text in [src..limit[ of s conform to FCD, or checks whether it already does.
 *
 * @param s source text
 * @param src start index (inclusive)
 * @param limit end index (exclusive)
 * @param buffer destination buffer, or null for the quick check
 * @return with buffer==null: the index of the first FCD-unsafe boundary (quick check "no"),
 *         or the end of the processed text; with buffer!=null: the end index
 */
public int makeFCD(final CharSequence s, int src, final int limit, final ReorderingBuffer buffer) {
// Note: In this function we use buffer->appendZeroCC() because we track
// the lead and trail combining classes here, rather than leaving it to
// the ReorderingBuffer.
// The exception is the call to decomposeShort() which uses the buffer
// in the normal way.
// Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
// Similar to the prevBoundary in the compose() implementation.
int prevBoundary = src;
int prevSrc;
int c = 0;
// prevFCD16 stores either the previous character's FCD value, or,
// while negative, the bitwise-NOT of a below-MIN_CCC_LCCC_CP code point
// whose FCD lookup was deferred (see the prevFCD16<0 branch below).
int prevFCD16 = 0;
int fcd16 = 0;
for (;;) {
// count code units with lccc==0
for (prevSrc = src; src != limit;) {
if ((c = s.charAt(src)) < MIN_CCC_LCCC_CP) {
prevFCD16 = ~c;
++src;
} else if (!singleLeadMightHaveNonZeroFCD16(c)) {
prevFCD16 = 0;
++src;
} else {
if (UTF16.isSurrogate((char) c)) {
char c2;
if (UTF16Plus.isSurrogateLead(c)) {
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char) c, c2);
}
} else /* trail surrogate */{
if (prevSrc < src && Character.isHighSurrogate(c2 = s.charAt(src - 1))) {
--src;
c = Character.toCodePoint(c2, (char) c);
}
}
}
if ((fcd16 = getFCD16FromNormData(c)) <= 0xff) {
prevFCD16 = fcd16;
src += Character.charCount(c);
} else {
break;
}
}
}
// copy these code units all at once
if (src != prevSrc) {
if (src == limit) {
if (buffer != null) {
buffer.flushAndAppendZeroCC(s, prevSrc, src);
}
break;
}
prevBoundary = src;
// We know that the previous character's lccc==0.
if (prevFCD16 < 0) {
// Fetching the fcd16 value was deferred for this below-U+0300 code point.
int prev = ~prevFCD16;
prevFCD16 = prev < 0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
if (prevFCD16 > 1) {
--prevBoundary;
}
} else {
int p = src - 1;
if (Character.isLowSurrogate(s.charAt(p)) && prevSrc < p && Character.isHighSurrogate(s.charAt(p - 1))) {
--p;
// Need to fetch the previous character's FCD value because
// prevFCD16 was just for the trail surrogate code point.
prevFCD16 = getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p + 1)));
// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
}
if (prevFCD16 > 1) {
prevBoundary = p;
}
}
if (buffer != null) {
// The last lccc==0 character is excluded from the
// flush-and-append call in case it needs to be modified.
buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
buffer.append(s, prevBoundary, src);
}
// The start of the current character (c).
prevSrc = src;
} else if (src == limit) {
break;
}
src += Character.charCount(c);
// The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
// Check for proper order, and decompose locally if necessary.
if ((prevFCD16 & 0xff) <= (fcd16 >> 8)) {
// proper order: prev tccc <= current lccc
if ((fcd16 & 0xff) <= 1) {
prevBoundary = src;
}
if (buffer != null) {
buffer.appendZeroCC(c);
}
prevFCD16 = fcd16;
continue;
} else if (buffer == null) {
return prevBoundary; // quick check "no"
} else {
/*
 * Back out the part of the source that we copied or appended
 * already but is now going to be decomposed.
 * prevSrc is set to after what was copied/appended.
 */
buffer.removeSuffix(prevSrc - prevBoundary);
/*
 * Find the part of the source that needs to be decomposed,
 * up to the next safe boundary.
 */
src = findNextFCDBoundary(s, src, limit);
/*
 * The source text does not fulfill the conditions for FCD.
 * Decompose and reorder a limited piece of the text.
 */
decomposeShort(s, prevBoundary, src, buffer);
prevBoundary = src;
prevFCD16 = 0;
}
}
return src;
}
/**
 * Appends s to the buffer, making the text FCD across the seam between the
 * existing buffer contents and the new text when doMakeFCD is true.
 * Mirrors composeAndAppend() but uses FCD boundaries.
 */
public void makeFCDAndAppend(final CharSequence s, final boolean doMakeFCD, final ReorderingBuffer buffer) {
    int start = 0;
    final int end = s.length();
    if (!buffer.isEmpty()) {
        final int srcBoundary = findNextFCDBoundary(s, 0, end);
        if (srcBoundary != 0) {
            // Stitch the destination tail and the source head together and re-process them.
            final int destBoundary = findPreviousFCDBoundary(buffer.getStringBuilder(), buffer.length());
            final StringBuilder seam = new StringBuilder((buffer.length() - destBoundary) + srcBoundary + 16);
            seam.append(buffer.getStringBuilder(), destBoundary, buffer.length());
            buffer.removeSuffix(buffer.length() - destBoundary);
            seam.append(s, 0, srcBoundary);
            makeFCD(seam, 0, seam.length(), buffer);
            start = srcBoundary;
        }
    }
    if (doMakeFCD) {
        makeFCD(s, start, end, buffer);
    } else {
        buffer.append(s, start, end);
    }
}
// Note: hasDecompBoundary() could be implemented as aliases to
// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
// at the cost of building the FCD trie for a decomposition normalizer.
/**
 * Does c have a decomposition boundary before (before=true) or after (before=false) it?
 * Follows algorithmic-mapping chains until a definitive answer is found.
 */
public boolean hasDecompBoundary(int c, final boolean before) {
// Only loops while c has a 1:1 algorithmic mapping.
for (;;) {
if (c < minDecompNoCP) {
return true;
}
int norm16 = getNorm16(c);
if (isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
return true;
} else if (norm16 > MIN_NORMAL_MAYBE_YES) {
return false; // ccc!=0
} else if (isDecompNoAlgorithmic(norm16)) {
c = mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16);
if ((firstUnit & MAPPING_LENGTH_MASK) == 0) {
// Empty mapping (c is deleted): no boundary.
return false;
}
if (!before) {
// decomp after-boundary: same as hasFCDBoundaryAfter(),
// fcd16<=1 || trailCC==0
if (firstUnit > 0x1ff) {
return false; // trailCC>1
}
if (firstUnit <= 0xff) {
return true; // trailCC==0
}
// if(trailCC==1) test leadCC==0, same as checking for before-boundary
}
// true if leadCC==0 (hasFCDBoundaryBefore())
return (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (extraData.charAt(norm16 - 1) & 0xff00) == 0;
}
}
}
/** Returns true if c is inert under decomposition: quick check "yes" and ccc==0. */
public boolean isDecompInert(final int c) {
    final int norm16 = getNorm16(c);
    return isDecompYesAndZeroCC(norm16);
}
/** Returns true if there is a composition boundary before c. */
public boolean hasCompBoundaryBefore(final int c) {
    if (c < minCompNoMaybeCP) {
        return true; // below the lowest "no"/"maybe" code point
    }
    return hasCompBoundaryBefore(c, getNorm16(c));
}
/**
 * Does c have a composition boundary after it?
 *
 * @param c code point
 * @param onlyContiguous true for FCC (additionally requires trailCC&lt;=1)
 * @param testInert true to test for full inertness rather than just a boundary
 */
public boolean hasCompBoundaryAfter(int c, final boolean onlyContiguous, final boolean testInert) {
// Only loops while c has a 1:1 algorithmic mapping.
for (;;) {
int norm16 = getNorm16(c);
if (isInert(norm16)) {
return true;
} else if (norm16 <= minYesNo) {
// Hangul: norm16==minYesNo
// Hangul LVT has a boundary after it.
// Hangul LV and non-inert yesYes characters combine forward.
return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char) c);
} else if (norm16 >= (testInert ? minNoNo : minMaybeYes)) {
return false;
} else if (isDecompNoAlgorithmic(norm16)) {
c = mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data.
// If testInert, then c must be a yesNo character which has lccc=0,
// otherwise it could be a noNo.
int firstUnit = extraData.charAt(norm16);
// true if
// not MAPPING_NO_COMP_BOUNDARY_AFTER
// (which is set if
// c is not deleted, and
// it and its decomposition do not combine forward, and it has a starter)
// and if FCC then trailCC<=1
return (firstUnit & MAPPING_NO_COMP_BOUNDARY_AFTER) == 0 && (!onlyContiguous || firstUnit <= 0x1ff);
}
}
}
/** Returns true if there is an FCD boundary before c, i.e., c has lccc==0. */
public boolean hasFCDBoundaryBefore(final int c) {
    if (c < MIN_CCC_LCCC_CP) {
        return true; // below the lowest code point with ccc!=0 or lccc!=0
    }
    return getFCD16(c) <= 0xff; // lead byte of fcd16 (lccc) is zero
}
/** Returns true if there is an FCD boundary after c, i.e., fcd16&lt;=1 or tccc==0. */
public boolean hasFCDBoundaryAfter(final int c) {
    final int fcd16 = getFCD16(c);
    if (fcd16 <= 1) {
        return true;
    }
    return (fcd16 & 0xff) == 0; // trail byte (tccc) is zero
}
/** Returns true if c is FCD-inert (fcd16 value of 0 or 1). */
public boolean isFCDInert(final int c) {
    final int fcd16 = getFCD16(c);
    return fcd16 <= 1;
}
// true if norm16 is in the "maybe" range [minMaybeYes..JAMO_VT].
private boolean isMaybe(final int norm16) {
    return norm16 >= minMaybeYes && norm16 <= JAMO_VT;
}
// true if norm16 indicates quick check "maybe" or a non-zero combining class.
private boolean isMaybeOrNonZeroCC(final int norm16) {
    return minMaybeYes <= norm16;
}
// true if c has no normalization data at all (norm16 value 0).
private static boolean isInert(final int norm16) {
    return 0 == norm16;
}
// Jamo L (leading consonant) has the special norm16 value 1.
private static boolean isJamoL(final int norm16) {
    return 1 == norm16;
}
// Jamo V/T (vowel/trailing consonant) share the special norm16 value JAMO_VT.
private static boolean isJamoVT(final int norm16) {
    return JAMO_VT == norm16;
}
// Hangul syllables are marked with the norm16 value minYesNo.
private boolean isHangul(final int norm16) {
    return minYesNo == norm16;
}
// true if NFC quick check "yes" and ccc==0: all values below the first "no".
private boolean isCompYesAndZeroCC(final int norm16) {
    return minNoNo > norm16;
}
// UBool isCompYes(uint16_t norm16) const {
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
// }
// UBool isCompYesOrMaybe(uint16_t norm16) const {
// return norm16<minNoNo || minMaybeYes<=norm16;
// }
// private boolean hasZeroCCFromDecompYes(int norm16) {
// return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
// }
// true if NFD quick check "yes" and ccc==0.
private boolean isDecompYesAndZeroCC(final int norm16) {
    if (norm16 < minYesNo || norm16 == JAMO_VT) {
        return true;
    }
    return minMaybeYes <= norm16 && norm16 <= MIN_NORMAL_MAYBE_YES;
}
/**
 * A little faster and simpler than isDecompYesAndZeroCC() but does not include the MaybeYes which combine-forward and have ccc=0.
 * (Standard Unicode 5.2 normalization does not have such characters.)
 */
private boolean isMostDecompYesAndZeroCC(final int norm16) {
    // Pure disjunction; ordering of the three tests is irrelevant.
    return norm16 == MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT || norm16 < minYesNo;
}
// true if c has a 1:1 algorithmic decomposition mapping (delta encoded in norm16).
private boolean isDecompNoAlgorithmic(final int norm16) {
    return limitNoNo <= norm16;
}
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
// }
// Gets the combining class of a "noNo" character from its extra-data mapping:
// the low byte of the optional ccc/lccc word preceding the first unit, else 0.
private int getCCFromNoNo(final int norm16) {
    final int firstUnit = extraData.charAt(norm16);
    if ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0) {
        return 0;
    }
    return extraData.charAt(norm16 - 1) & 0xff;
}
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
int getTrailCCFromCompYesAndZeroCC(final CharSequence s, final int cpStart, final int cpLimit) {
    // A single code unit is a BMP character; otherwise read the surrogate pair.
    final int c = (cpLimit - cpStart) == 1 ? s.charAt(cpStart) : Character.codePointAt(s, cpStart);
    final int prevNorm16 = getNorm16(c);
    if (prevNorm16 > minYesNo) {
        return extraData.charAt(prevNorm16) >> 8; // tccc from yesNo
    }
    return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
}
// Requires algorithmic-NoNo.
// The mapping target is c plus a delta encoded in norm16.
private int mapAlgorithmic(final int c, final int norm16) {
    final int delta = norm16 - (minMaybeYes - MAX_DELTA - 1);
    return c + delta;
}
// Requires minYesNo<norm16<limitNoNo.
// private int getMapping(int norm16) { return /*extraData+*/norm16; }
/**
 * @return index into maybeYesCompositions, or -1 if norm16 has no compositions list
 */
private int getCompositionsListForDecompYes(final int norm16) {
    if (norm16 == 0 || norm16 >= MIN_NORMAL_MAYBE_YES) {
        return -1; // inert, or ccc!=0 without compositions
    }
    if (norm16 < minMaybeYes) {
        // norm16 is an index into extraData, which is the substring of
        // maybeYesCompositions starting at MIN_NORMAL_MAYBE_YES-minMaybeYes.
        // For yesYes; if Jamo L: harmless empty list.
        return (MIN_NORMAL_MAYBE_YES - minMaybeYes) + norm16;
    }
    return norm16 - minMaybeYes;
}
/**
 * @return index into maybeYesCompositions
 */
private int getCompositionsListForComposite(final int norm16) {
    // A composite has both a mapping and a compositions list:
    // skip the first unit (which holds the mapping length) and the mapping itself,
    // then shift the extraData offset into maybeYesCompositions space.
    final int firstUnit = extraData.charAt(norm16);
    final int mappingLength = firstUnit & MAPPING_LENGTH_MASK;
    return (MIN_NORMAL_MAYBE_YES - minMaybeYes) + norm16 + 1 + mappingLength;
}
/**
 * @param norm16
 * norm16 value of a code point that must have compositions
 * @return index into maybeYesCompositions
 */
private int getCompositionsList(final int norm16) {
return isDecompYes(norm16) ? getCompositionsListForDecompYes(norm16) : getCompositionsListForComposite(norm16);
}
// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
// is unlikely to be amortized.
// Called by the compose() and makeFCD() implementations.
// Public in Java for collation implementation code.
public void decomposeShort(final CharSequence s, int src, final int limit, final ReorderingBuffer buffer) {
    for (int i = src; i < limit;) {
        final int c = Character.codePointAt(s, i);
        i += Character.charCount(c);
        decompose(c, getNorm16(c), buffer);
    }
}
/**
 * Appends the decomposition of c (with the given norm16 value) to the buffer.
 * Handles the no-decomposition, Hangul, algorithmic-mapping, and extra-data cases.
 */
private void decompose(int c, int norm16, final ReorderingBuffer buffer) {
// Only loops for 1:1 algorithmic mappings.
for (;;) {
// get the decomposition and the lead and trail cc's
if (isDecompYes(norm16)) {
// c does not decompose
buffer.append(c, getCCFromYesOrMaybe(norm16));
} else if (isHangul(norm16)) {
// Hangul syllable: decompose algorithmically
Hangul.decompose(c, buffer);
} else if (isDecompNoAlgorithmic(norm16)) {
// Map to the decomposition target and retry with its norm16 value.
c = mapAlgorithmic(c, norm16);
norm16 = getNorm16(c);
continue;
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16);
int length = firstUnit & MAPPING_LENGTH_MASK;
int leadCC, trailCC;
// The trail cc is stored in the high byte of the first unit.
trailCC = firstUnit >> 8;
if ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) != 0) {
// The optional word before the first unit carries ccc/lccc.
leadCC = extraData.charAt(norm16 - 1) >> 8;
} else {
leadCC = 0;
}
++norm16; // skip over the firstUnit
buffer.append(extraData, norm16, norm16 + length, leadCC, trailCC);
}
return;
}
}
/**
* Finds the recomposition result for a forward-combining "lead" character, specified with a pointer to its compositions list, and a
* backward-combining "trail" character.
*
* <p>
* If the lead and trail characters combine, then this function returns the following "compositeAndFwd" value:
*
* <pre>
* Bits 21..1 composite character
* Bit 0 set if the composite is a forward-combining starter
* </pre>
*
* otherwise it returns -1.
*
* <p>
* The compositions list has (trail, compositeAndFwd) pair entries, encoded as either pairs or triples of 16-bit units. The last entry
* has the high bit of its first unit set.
*
* <p>
* The list is sorted by ascending trail characters (there are no duplicates). A linear search is used.
*
* <p>
* See normalizer2impl.h for a more detailed description of the compositions list format.
*/
private static int combine(final String compositions, int list, final int trail) {
int key1, firstUnit;
if (trail < COMP_1_TRAIL_LIMIT) {
// trail character is 0..33FF
// result entry may have 2 or 3 units
// The search key is the trail character shifted left by one;
// bit 0 of a list unit distinguishes pair (0) from triple (1) entries.
key1 = (trail << 1);
// Linear search: entries are sorted by ascending trail character.
while (key1 > (firstUnit = compositions.charAt(list))) {
list += 2 + (firstUnit & COMP_1_TRIPLE);
}
if (key1 == (firstUnit & COMP_1_TRAIL_MASK)) {
if ((firstUnit & COMP_1_TRIPLE) != 0) {
// Triple: compositeAndFwd is split across two 16-bit units.
return ((int) compositions.charAt(list + 1) << 16) | compositions.charAt(list + 2);
} else {
// Pair: compositeAndFwd fits in one unit.
return compositions.charAt(list + 1);
}
}
} else {
// trail character is 3400..10FFFF
// result entry has 3 units
// The trail character is split into two search keys across two units.
key1 = COMP_1_TRAIL_LIMIT + (((trail >> COMP_1_TRAIL_SHIFT)) & ~COMP_1_TRIPLE);
int key2 = (trail << COMP_2_TRAIL_SHIFT) & 0xffff;
int secondUnit;
for (;;) {
if (key1 > (firstUnit = compositions.charAt(list))) {
list += 2 + (firstUnit & COMP_1_TRIPLE);
} else if (key1 == (firstUnit & COMP_1_TRAIL_MASK)) {
if (key2 > (secondUnit = compositions.charAt(list + 1))) {
if ((firstUnit & COMP_1_LAST_TUPLE) != 0) {
// Reached the last entry with this key1; no match.
break;
} else {
list += 3;
}
} else if (key2 == (secondUnit & COMP_2_TRAIL_MASK)) {
// Matched: assemble compositeAndFwd from the remaining bits.
return ((secondUnit & ~COMP_2_TRAIL_MASK) << 16) | compositions.charAt(list + 2);
} else {
break;
}
} else {
break;
}
}
}
// The lead and trail characters do not combine.
return -1;
}
/**
* @param list
* some character's compositions list
* @param set
* recursively receives the composites from these compositions
*/
private void addComposites(int list, final UnicodeSet set) {
int firstUnit, compositeAndFwd;
// Walk the (trail, compositeAndFwd) entries; the last entry has
// COMP_1_LAST_TUPLE set in its first unit.
do {
firstUnit = maybeYesCompositions.charAt(list);
if ((firstUnit & COMP_1_TRIPLE) == 0) {
// Pair entry: compositeAndFwd in one unit.
compositeAndFwd = maybeYesCompositions.charAt(list + 1);
list += 2;
} else {
// Triple entry: compositeAndFwd split across two units.
compositeAndFwd = (((int) maybeYesCompositions.charAt(list + 1) & ~COMP_2_TRAIL_MASK) << 16)
| maybeYesCompositions.charAt(list + 2);
list += 3;
}
int composite = compositeAndFwd >> 1;
if ((compositeAndFwd & 1) != 0) {
// The composite itself combines forward: recurse into its list.
addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
}
set.add(composite);
} while ((firstUnit & COMP_1_LAST_TUPLE) == 0);
}
/*
* Recomposes the buffer text starting at recomposeStartIndex
* (which is in NFD - decomposed and canonically ordered),
* and truncates the buffer contents.
*
* Note that recomposition never lengthens the text:
* Any character consists of either one or two code units;
* a composition may contain at most one more code unit than the original starter,
* while the combining mark that is removed has at least one code unit.
*/
/**
 * Recomposes the buffer text starting at recomposeStartIndex
 * (which must be in NFD - decomposed and canonically ordered)
 * and flushes the buffer.
 *
 * Fix: when a supplementary starter composes into a BMP composite, the starter
 * must be overwritten with the composite, not with the combining mark c.
 */
private void recompose(final ReorderingBuffer buffer, final int recomposeStartIndex, final boolean onlyContiguous) {
StringBuilder sb = buffer.getStringBuilder();
int p = recomposeStartIndex;
if (p == sb.length()) {
return;
}
int starter, pRemove;
int compositionsList;
int c, compositeAndFwd;
int norm16;
int cc, prevCC;
boolean starterIsSupplementary;
// Some of the following variables are not used until we have a forward-combining starter
// and are only initialized now to avoid compiler warnings.
compositionsList = -1; // used as indicator for whether we have a forward-combining starter
starter = -1;
starterIsSupplementary = false;
prevCC = 0;
for (;;) {
c = sb.codePointAt(p);
p += Character.charCount(c);
norm16 = getNorm16(c);
cc = getCCFromYesOrMaybe(norm16);
if ( // this character combines backward and
isMaybe(norm16) &&
// we have seen a starter that combines forward and
compositionsList >= 0 &&
// the backward-combining character is not blocked
(prevCC < cc || prevCC == 0)) {
if (isJamoVT(norm16)) {
// c is a Jamo V/T, see if we can compose it with the previous character.
if (c < Hangul.JAMO_T_BASE) {
// c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
char prev = (char) (sb.charAt(starter) - Hangul.JAMO_L_BASE);
if (prev < Hangul.JAMO_L_COUNT) {
pRemove = p - 1;
char syllable = (char) (Hangul.HANGUL_BASE + (prev * Hangul.JAMO_V_COUNT + (c - Hangul.JAMO_V_BASE))
* Hangul.JAMO_T_COUNT);
char t;
if (p != sb.length() && (t = (char) (sb.charAt(p) - Hangul.JAMO_T_BASE)) < Hangul.JAMO_T_COUNT) {
++p;
syllable += t; // The next character was a Jamo T.
}
sb.setCharAt(starter, syllable);
// remove the Jamo V/T
sb.delete(pRemove, p);
p = pRemove;
}
}
/*
 * No "else" for Jamo T:
 * Since the input is in NFD, there are no Hangul LV syllables that
 * a Jamo T could combine with.
 * All Jamo Ts are combined above when handling Jamo Vs.
 */
if (p == sb.length()) {
break;
}
compositionsList = -1;
continue;
} else if ((compositeAndFwd = combine(maybeYesCompositions, compositionsList, c)) >= 0) {
// The starter and the combining mark (c) do combine.
int composite = compositeAndFwd >> 1;
// Remove the combining mark.
pRemove = p - Character.charCount(c); // pRemove & p: start & limit of the combining mark
sb.delete(pRemove, p);
p = pRemove;
// Replace the starter with the composite.
if (starterIsSupplementary) {
if (composite > 0xffff) {
// both are supplementary
sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
sb.setCharAt(starter + 1, UTF16.getTrailSurrogate(composite));
} else {
// Fixed: write the composite, not the removed combining mark c.
sb.setCharAt(starter, (char) composite);
sb.deleteCharAt(starter + 1);
// The composite is shorter than the starter,
// move the intermediate characters forward one.
starterIsSupplementary = false;
--p;
}
} else if (composite > 0xffff) {
// The composite is longer than the starter,
// move the intermediate characters back one.
starterIsSupplementary = true;
sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
sb.insert(starter + 1, UTF16.getTrailSurrogate(composite));
++p;
} else {
// both are on the BMP
sb.setCharAt(starter, (char) composite);
}
// Keep prevCC because we removed the combining mark.
if (p == sb.length()) {
break;
}
// Is the composite a starter that combines forward?
if ((compositeAndFwd & 1) != 0) {
compositionsList = getCompositionsListForComposite(getNorm16(composite));
} else {
compositionsList = -1;
}
// We combined; continue with looking for compositions.
continue;
}
}
// no combination this time
prevCC = cc;
if (p == sb.length()) {
break;
}
// If c did not combine, then check if it is a starter.
if (cc == 0) {
// Found a new starter.
if ((compositionsList = getCompositionsListForDecompYes(norm16)) >= 0) {
// It may combine with something, prepare for it.
if (c <= 0xffff) {
starterIsSupplementary = false;
starter = p - 1;
} else {
starterIsSupplementary = true;
starter = p - 2;
}
}
} else if (onlyContiguous) {
// FCC: no discontiguous compositions; any intervening character blocks.
compositionsList = -1;
}
}
buffer.flush();
}
/**
 * Returns the canonical composite of the code point pair (a, b),
 * or a negative value if they do not compose.
 */
public int composePair(final int a, int b) {
int norm16 = getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
int list;
if (isInert(norm16)) {
return -1;
} else if (norm16 < minYesNoMappingsOnly) {
if (isJamoL(norm16)) {
// Hangul L+V -> LV syllable.
b -= Hangul.JAMO_V_BASE;
if (0 <= b && b < Hangul.JAMO_V_COUNT) {
return (Hangul.HANGUL_BASE + ((a - Hangul.JAMO_L_BASE) * Hangul.JAMO_V_COUNT + b) * Hangul.JAMO_T_COUNT);
} else {
return -1;
}
} else if (isHangul(norm16)) {
// Hangul LV+T -> LVT syllable.
b -= Hangul.JAMO_T_BASE;
if (Hangul.isHangulWithoutJamoT((char) a) && 0 < b && b < Hangul.JAMO_T_COUNT) { // not b==0!
return a + b;
} else {
return -1;
}
} else {
// 'a' has a compositions list in extraData
list = norm16;
if (norm16 > minYesNo) { // composite 'a' has both mapping & compositions list
list += // mapping pointer
1 + // +1 to skip the first unit with the mapping length
(extraData.charAt(list) & MAPPING_LENGTH_MASK); // + mapping length
}
// Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
list += MIN_NORMAL_MAYBE_YES - minMaybeYes;
}
} else if (norm16 < minMaybeYes || MIN_NORMAL_MAYBE_YES <= norm16) {
return -1;
} else {
list = norm16 - minMaybeYes; // offset into maybeYesCompositions
}
if (b < 0 || 0x10ffff < b) { // combine(list, b) requires a valid code point b
return -1;
}
// combine() returns -1 (no composition) or compositeAndFwd; >>1 strips the
// combines-forward bit and maps -1 to -1.
return combine(maybeYesCompositions, list, b) >> 1;
}
/**
* Does c have a composition boundary before it? True if its decomposition begins with a character that has ccc=0 && NFC_QC=Yes
* (isCompYesAndZeroCC()). As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()) so we need not
* decompose.
*/
private boolean hasCompBoundaryBefore(int c, int norm16) {
// Only loops while c has a 1:1 algorithmic mapping.
for (;;) {
if (isCompYesAndZeroCC(norm16)) {
return true;
} else if (isMaybeOrNonZeroCC(norm16)) {
// Combines backward or has ccc!=0: no boundary.
return false;
} else if (isDecompNoAlgorithmic(norm16)) {
c = mapAlgorithmic(c, norm16);
norm16 = getNorm16(c);
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit = extraData.charAt(norm16);
if ((firstUnit & MAPPING_LENGTH_MASK) == 0) {
// Empty mapping (c is deleted): no boundary.
return false;
}
if ((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) != 0 && (extraData.charAt(norm16 - 1) & 0xff00) != 0) {
return false; // non-zero leadCC
}
// Test the first code point of the decomposition.
return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16 + 1)));
}
}
}
// Moves backward from p to the closest composition boundary and returns its index.
private int findPreviousCompBoundary(final CharSequence s, int p) {
    int i = p;
    while (i > 0) {
        final int c = Character.codePointBefore(s, i);
        i -= Character.charCount(c);
        if (hasCompBoundaryBefore(c)) {
            return i;
        }
        // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
        // but that's probably not worth the extra cost.
    }
    return i;
}
// Moves forward from p to the next composition boundary in [p..limit[ and returns its index.
private int findNextCompBoundary(final CharSequence s, int p, final int limit) {
    int i = p;
    while (i < limit) {
        final int c = Character.codePointAt(s, i);
        if (hasCompBoundaryBefore(c, normTrie.get(c))) {
            return i;
        }
        i += Character.charCount(c);
    }
    return i;
}
// Moves backward from p to the closest FCD boundary and returns its index.
private int findPreviousFCDBoundary(final CharSequence s, int p) {
    int i = p;
    while (i > 0) {
        final int c = Character.codePointBefore(s, i);
        i -= Character.charCount(c);
        // Same condition as the original inline test: c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff.
        if (hasFCDBoundaryBefore(c)) {
            return i;
        }
    }
    return i;
}
/**
 * Scans forward from index p in s for the next FCD boundary.
 * A boundary is before any code point below MIN_CCC_LCCC_CP, or one whose
 * FCD16 value has no lead-ccc byte (value &lt;= 0xff).
 *
 * @param s the text to scan
 * @param p the starting index; must be a code point boundary
 * @param limit the exclusive end of the scan
 * @return the index of the first FCD boundary at or after p, or limit
 */
private int findNextFCDBoundary(final CharSequence s, int p, final int limit) {
    int pos = p;
    while (pos < limit) {
        final int cp = Character.codePointAt(s, pos);
        final boolean boundary = cp < MIN_CCC_LCCC_CP || getFCD16(cp) <= 0xff;
        if (boundary) {
            return pos;
        }
        pos += Character.charCount(cp);
    }
    return pos;
}
/**
* Records that the decomposition of origin starts with decompLead, in the
* canonical-iterator data being built in newData.
* The trie value for decompLead encodes either a single origin code point
* (in the CANON_VALUE_MASK bits) or, once there is more than one origin,
* CANON_HAS_SET plus an index into canonStartSets.
*
* @param newData the writable canonical-iterator trie under construction
* @param origin the code point whose decomposition starts with decompLead
* @param decompLead the first code point of origin's decomposition
*/
private void addToStartSet(final Trie2Writable newData, final int origin, final int decompLead) {
int canonValue = newData.get(decompLead);
if ((canonValue & (CANON_HAS_SET | CANON_VALUE_MASK)) == 0 && origin != 0) {
// origin is the first character whose decomposition starts with
// the character for which we are setting the value.
newData.set(decompLead, canonValue | origin);
} else {
// origin is not the first character, or it is U+0000.
UnicodeSet set;
if ((canonValue & CANON_HAS_SET) == 0) {
// Switch from the single-code-point encoding to a set:
// allocate a new UnicodeSet and store its index in the value bits.
int firstOrigin = canonValue & CANON_VALUE_MASK;
canonValue = (canonValue & ~CANON_VALUE_MASK) | CANON_HAS_SET | canonStartSets.size();
newData.set(decompLead, canonValue);
canonStartSets.add(set = new UnicodeSet());
if (firstOrigin != 0) {
// Migrate the previously stored single origin into the set.
set.add(firstOrigin);
}
} else {
// A set already exists; the value bits hold its index.
set = canonStartSets.get(canonValue & CANON_VALUE_MASK);
}
set.add(origin);
}
}
// Unicode version of the loaded normalization data.
private VersionInfo dataVersion;
// Code point thresholds for quick check codes.
private int minDecompNoCP;
private int minCompNoMaybeCP;
// Norm16 value thresholds for quick check combinations and types of extra data.
private int minYesNo;
private int minYesNoMappingsOnly;
private int minNoNo;
private int limitNoNo;
private int minMaybeYes;
// Main per-code-point trie mapping each code point to its norm16 value.
private Trie2_16 normTrie;
// Composition lists for maybeYes characters (norm16 >= minMaybeYes
// indexes into this string; see the combine() lookups above).
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
// Lazily built data for canonical iteration; see addToStartSet().
private Trie2_32 canonIterData;
// Sets of characters whose decompositions start with a given character,
// indexed by the CANON_VALUE_MASK bits of canonIterData values.
private ArrayList<UnicodeSet> canonStartSets;
// bits in canonIterData
private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
private static final int CANON_HAS_SET = 0x200000;
private static final int CANON_VALUE_MASK = 0x1fffff;
}