Source Code of net.sourceforge.pinyin4j.PinyinFormatter

/**
 * This file is part of pinyin4j (http://sourceforge.net/projects/pinyin4j/) and distributed under
 * GNU GENERAL PUBLIC LICENSE (GPL).
 * 
 * pinyin4j is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * 
 * pinyin4j is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with pinyin4j.
 */


package net.sourceforge.pinyin4j;


import java.util.Locale;

import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;


/**
 * Contains logic to format given Pinyin string
 * 
 * @author Li Min (xmlerlimin@gmail.com)
 * 
 */
class PinyinFormatter {
  /**
   * @param pinyinStr
   *            unformatted Hanyu Pinyin string
   * @param outputFormat
   *            given format of Hanyu Pinyin
   * @return formatted Hanyu Pinyin string
   * @throws BadHanyuPinyinOutputFormatCombination
   */
  static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
      throws BadHanyuPinyinOutputFormatCombination {
    if ((HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType())
        && ((HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) || (HanyuPinyinVCharType.WITH_U_AND_COLON == outputFormat
            .getVCharType()))) {
      throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");
    }


    if (HanyuPinyinToneType.WITHOUT_TONE == outputFormat.getToneType()) {
      pinyinStr = pinyinStr.replaceAll("[1-5]", "");
    } else if (HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType()) {
      pinyinStr = pinyinStr.replaceAll("u:", "v");
      pinyinStr = convertToneNumber2ToneMark(pinyinStr);
    }


    if (HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) {
      pinyinStr = pinyinStr.replaceAll("u:", "v");
    } else if (HanyuPinyinVCharType.WITH_U_UNICODE == outputFormat.getVCharType()) {
      pinyinStr = pinyinStr.replaceAll("u:", "ü");
    }


    if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
      pinyinStr = pinyinStr.toUpperCase();
    }
    return pinyinStr;
  }


  /**
   * Convert tone numbers to tone marks using Unicode <br/><br/>
   * 
   * <b>Algorithm for determining location of tone mark</b><br/>
   * 
   * A simple algorithm for determining the vowel on which the tone mark
   * appears is as follows:<br/>
   * 
   * <ol>
   * <li>First, look for an "a" or an "e". If either vowel appears, it takes
   * the tone mark. There are no possible pinyin syllables that contain both
   * an "a" and an "e".
   * 
   * <li>If there is no "a" or "e", look for an "ou". If "ou" appears, then
   * the "o" takes the tone mark.
   * 
   * <li>If none of the above cases hold, then the last vowel in the syllable
   * takes the tone mark.
   * 
   * </ol>
   * 
   * @param pinyinStr
   *            the ascii represention with tone numbers
   * @return the unicode represention with tone marks
   */
  private static String convertToneNumber2ToneMark(final String pinyinStr) {
    String lowerCasePinyinStr = pinyinStr.toLowerCase();


    if (lowerCasePinyinStr.matches("[a-z]*[1-5]?")) {
      final char defautlCharValue = '$';
      final int defautlIndexValue = -1;


      char unmarkedVowel = defautlCharValue;
      int indexOfUnmarkedVowel = defautlIndexValue;


      final char charA = 'a';
      final char charE = 'e';
      final String ouStr = "ou";
      final String allUnmarkedVowelStr = "aeiouv";
      final String allMarkedVowelStr = "āáăàaēéĕèeīíĭìiōóŏòoūúŭùuǖǘǚǜü";


      if (lowerCasePinyinStr.matches("[a-z]*[1-5]")) {


        int tuneNumber =
            Character.getNumericValue(lowerCasePinyinStr.charAt(lowerCasePinyinStr.length() - 1));


        int indexOfA = lowerCasePinyinStr.indexOf(charA);
        int indexOfE = lowerCasePinyinStr.indexOf(charE);
        int ouIndex = lowerCasePinyinStr.indexOf(ouStr);


        if (-1 != indexOfA) {
          indexOfUnmarkedVowel = indexOfA;
          unmarkedVowel = charA;
        } else if (-1 != indexOfE) {
          indexOfUnmarkedVowel = indexOfE;
          unmarkedVowel = charE;
        } else if (-1 != ouIndex) {
          indexOfUnmarkedVowel = ouIndex;
          unmarkedVowel = ouStr.charAt(0);
        } else {
          for (int i = lowerCasePinyinStr.length() - 1; i >= 0; i--) {
            if (String.valueOf(lowerCasePinyinStr.charAt(i)).matches(
                "[" + allUnmarkedVowelStr + "]")) {
              indexOfUnmarkedVowel = i;
              unmarkedVowel = lowerCasePinyinStr.charAt(i);
              break;
            }
          }
        }


        if ((defautlCharValue != unmarkedVowel) && (defautlIndexValue != indexOfUnmarkedVowel)) {
          int rowIndex = allUnmarkedVowelStr.indexOf(unmarkedVowel);
          int columnIndex = tuneNumber - 1;


          int vowelLocation = rowIndex * 5 + columnIndex;


          char markedVowel = allMarkedVowelStr.charAt(vowelLocation);


          StringBuffer resultBuffer = new StringBuffer();


          resultBuffer.append(lowerCasePinyinStr.substring(0, indexOfUnmarkedVowel).replaceAll("v",
              "ü"));
          resultBuffer.append(markedVowel);
          resultBuffer.append(lowerCasePinyinStr.substring(indexOfUnmarkedVowel + 1,
              lowerCasePinyinStr.length() - 1).replaceAll("v", "ü"));


          return resultBuffer.toString();


        } else
        // error happens in the procedure of locating vowel
        {
          return lowerCasePinyinStr;
        }
      } else
      // input string has no any tune number
      {
        // only replace v with ü (umlat) character
        return lowerCasePinyinStr.replaceAll("v", "ü");
      }
    } else
    // bad format
    {
      return lowerCasePinyinStr;
    }
  }
}
Source Code of net.sourceforge.pinyin4j.PinyinFormatter

Related Classes of net.sourceforge.pinyin4j.PinyinFormatter