Package be.frma.langguess

Source Code of be.frma.langguess.LangProfileFactory

/*
* Copyright 2011 Francois ROLAND
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package be.frma.langguess;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.cybozu.labs.langdetect.util.LangProfile;

public class LangProfileFactory {
  private static final Pattern FREQ_PATTERN = Pattern.compile("\"freq\" ?: ?\\{(.+?)\\}");
  private static final Pattern N_WORDS_PATTERN = Pattern.compile("\"n_words\" ?: ?\\[(.+?)\\]");
  private static final Pattern NAME_PATTERN = Pattern.compile("\"name\" ?: ?\"(.+?)\"");
 
  public static LangProfile readProfile(InputStream input) throws IOException {
    StringBuilder buffer = new StringBuilder();
    BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName("utf-8")));
    String line = null;
    while((line = reader.readLine()) != null) {
      if (buffer.length() > 0) {
        buffer.append(' ');
      }
      buffer.append(line);
    }
    String storedProfile = buffer.toString();
    LangProfile langProfile = new LangProfile();

    Matcher m = FREQ_PATTERN.matcher(storedProfile);
    if (m.find()) {
      String[] entries = m.group(1).split(",");
      for (String entry : entries) {
        String[] keyValue = entry.split(":");
        String label = keyValue[0].trim().replace("\"", "");
        langProfile.getFreq().put(label, Integer.valueOf(keyValue[1]));
      }
    }

    m = N_WORDS_PATTERN.matcher(storedProfile);
    if (m.find()) {
      String[] nWords = m.group(1).split(",");
      langProfile.setNWords(new int[nWords.length]);
      for (int i = 0; i < nWords.length; i++) {
        langProfile.getNWords()[i] = Integer.parseInt(nWords[i]);
      }
    }
   
    m = NAME_PATTERN.matcher(storedProfile);
    if (m.find()) {
      langProfile.setName(m.group(1));
    }

    return langProfile;
  }

  public static void writeProfile(LangProfile langProfile, OutputStream output) throws IOException {
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(output, Charset.forName("utf-8")));
    writer.write("{\"freq\":{");
    boolean first = true;
    for (Map.Entry<String, Integer> entry : langProfile.getFreq().entrySet()) {
      if (!first) {
        writer.write(',');
      }
      writer.write('"');
      writer.write(entry.getKey());
      writer.write("\":");
      writer.write(entry.getValue().toString());
      first = false;
    }
    writer.write("},\"n_words\":[");
    first = true;
    for (int nWord : langProfile.getNWords()) {
      if (!first) {
        writer.write(',');
      }
      writer.write(Integer.toString(nWord, 10));
      first = false;
    }
    writer.write("],\"name\":\"");
    writer.write(langProfile.getName());
    writer.write("\"}");
    writer.flush();
  }
}
TOP

Related Classes of be.frma.langguess.LangProfileFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.