Package net.paoding.analysis.analyzer.estimate

Source Code of net.paoding.analysis.analyzer.estimate.TryPaodingAnalyzer

package net.paoding.analysis.analyzer.estimate;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Analyzer;

public class TryPaodingAnalyzer {
  private static final String ARGS_TIP = ":";
  static String input = null;
  static String file = null;
  static Reader reader = null;
  static String charset = null;
  static String mode = null;
  static String analyzerName = null;
  static String print = null;
  static String properties = PaodingMaker.DEFAULT_PROPERTIES_PATH;
 
  public static void main(String[] args) {
    try {
      resetArgs();
     
      int inInput = 0;
      for (int i = 0; i < args.length; i++) {
        if (args[i] == null || (args[i] = args[i].trim()).length() == 0) {
          continue;
        }
        if (args[i].equals("--file") || args[i].equals("-f")) {
          file = args[++i];
        } else if (args[i].equals("--charset") || args[i].equals("-c")) {
          charset = args[++i];
        } else if (args[i].equals("--mode") || args[i].equals("-m")) {
          mode = args[++i];
        } else if (args[i].equals("--properties") || args[i].equals("-p")) {
          properties = args[++i];
        } else if (args[i].equals("--analyzer") || args[i].equals("-a")) {
          analyzerName = args[++i];
        } else if (args[i].equals("--print") || args[i].equals("-P")) {
          print = args[++i];
        } else if (args[i].equals("--input") || args[i].equals("-i")) {
          inInput++;
        } else if (args[i].equals("--help") || args[i].equals("-h")
            || args[i].equals("?")) {
          printHelp();
          return;
        } else {
          // 非选项的参数数组视为input
          if (!args[i].startsWith("-")
              && (i == 0 || args[i - 1].equals("-i") || args[i - 1].equals("--input") || !args[i - 1].startsWith("-"))) {
            if (input == null) {
              input = args[i];// !!没有++i
            } else {
              input = input + ' ' + args[i];// !!没有++i
            }
            inInput++;
          }
        }
      }
      if (file != null) {
        input = null;
        reader = getReader(file, charset);
      }
      //
      analysing();
    } catch (Exception e1) {
      resetArgs();
      e1.printStackTrace();
    }
  }



  private static void resetArgs() {
    input = null;
    file = null;
    reader = null;
    charset = null;
    mode = null;
    print = null;
    analyzerName = null;
    properties = PaodingMaker.DEFAULT_PROPERTIES_PATH;
  }
 

 
  private static void analysing() throws Exception {
    Analyzer analyzer;
    if (analyzerName == null || analyzerName.length() == 0 || analyzerName.equalsIgnoreCase("paoding")) {
      //properties==null等同于new new PaodingAnalyzer();
      analyzer = new PaodingAnalyzer(properties);
      if (mode != null) {
        ((PaodingAnalyzer) analyzer).setMode(mode);
      }
    }
    else {
      Class clz;
      if (analyzerName.equalsIgnoreCase("standard")) {
        analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
      }
      else if (analyzerName.equalsIgnoreCase("cjk")) {
        analyzerName = "org.apache.lucene.analysis.cjk.CJKAnalyzer";
      }
      else if (analyzerName.equalsIgnoreCase("cn") || analyzerName.equalsIgnoreCase("chinese")) {
        analyzerName = "org.apache.lucene.analysis.cn.ChineseAnalyzer";
      }
      else if (analyzerName.equalsIgnoreCase("st") || analyzerName.equalsIgnoreCase("standard")) {
        analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
      }
      clz = Class.forName(analyzerName);
      analyzer = (Analyzer) clz.newInstance();
    }
    boolean readInputFromConsle = false;
    Estimate estimate = new Estimate(analyzer);
    if (print != null) {
      estimate.setPrint(print);
    }
    while (true) {
      if (reader == null) {
        if (input == null || input.length() == 0 || readInputFromConsle) {
          input = getInputFromConsole();
          readInputFromConsle = true;
        }
        if (input == null || input.length() == 0) {
          System.out.println("Warn: none charactors you input!!");
          continue;
        }
        else if (input.startsWith(ARGS_TIP)) {
          String argsStr = input.substring(ARGS_TIP.length());
          main(argsStr.split(" "));
          continue;
        }
      }
      if (reader != null) {
        estimate.test(System.out, reader);
        reader = null;
      }
      else {
        estimate.test(System.out, input);
        input = null;
      }
      System.out.println("--------------------------------------------------");
      if (false == readInputFromConsle) {
        return;
      }
    }
  }

  private static String getInputFromConsole() throws IOException {
    printTitleIfNotPrinted("");
    String input = null;
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        System.in));
    String line;
    do {
      System.out.print("paoding> ");
      line = reader.readLine();
      if (line == null || line.length() == 0) {
        continue;
      }
      if (line.equals(ARGS_TIP + "clear") || line.equals(ARGS_TIP + "c")) {
        input = null;
        System.out.println("paoding> Cleared");
        return getInputFromConsole();
      }
      else if (line.equals(ARGS_TIP + "exit") || line.equals(ARGS_TIP + "quit") || line.equals(ARGS_TIP + "e") || line.equals(ARGS_TIP + "q") ) {
        System.out.println("Bye!");
        System.exit(0);
      }
      else if (input == null && line.startsWith(ARGS_TIP)) {
        input = line;
        break;
      }
      else {
        if (line.endsWith(";")) {
          if (line.length() > ";".length()) {
            input = line.substring(0, line.length() - ";".length());
          }
          break;
        }
        else {
          if (input == null) {
            input = line;
          } else {
            input = input + "\n" + line;
          }
        }
      }
    } while (true);
    return input == null ? null : input.trim();
  }

  private static void printHelp() {
    String app = System.getProperty("paoding.try.app",
        "TryPaodingAnalyzer");
    String cmd = System.getProperty("paoding.try.cmd", "java "
        + TryPaodingAnalyzer.class.getName());
    System.out.println(app + "的用法:");
    System.out.println("\t" + cmd + " [OPTIONS] [text_content]");
    System.out.println("\nOPTIONS:");
    System.out.println("\t--file, -f:\n\t\t文章以文件的形式输入,在前缀加上\"classpath:\"表示从类路径中寻找该文件。");
    System.out.println("\t--charset, -c:\n\t\t文章的字符集编码,比如gbk,utf-8等。如果没有设置该选项,则使用Java环境默认的字符集编码。");
    System.out.println("\t--properties, -p:\n\t\t不读取默认的类路径下的庖丁分词属性文件,而使用指定的文件,在前缀加上\"classpath:\"表示从类路径中寻找该文件。");
    System.out.println("\t--mode, -m:\n\t\t强制使用给定的mode的分词器;可以设定为default,most-words,max-word-length或指定类名的其他mode(指定类名的,需要加前缀\"class:\")。");
    System.out.println("\t--input, -i:\n\t\t要被分词的文章内容;当没有通过-f或--file指定文章输入文件时可选择这个选项指定要被分词的内容。");
    System.out.println("\t--analyzer, -a:\n\t\t测试其他分词器,通过--analyzer或-a指定其完整类名。特别地,paoding、cjk、chinese、st分别对应PaodingAnalyzer、CJKAnalyzer、ChineseAnalyzer、StandardAnalyzer");
    System.out.println("\t--print, -P:\n\t\t 是否打印分词结果。默认打印前50行。规则:no表示不打印;50等价于1-50行;1-50表示打印1至50行;可以以逗号组合使用,如20,40-50表示打印1-20以及40-50行");
    System.out.println("\n示例:");
    System.out.println("\t" + cmd);
    System.out.println("\t" + cmd + " ?");
    System.out.println("\t" + cmd + " 中华人民共和国");
    System.out.println("\t" + cmd + " -m max 中华人民共和国");
    System.out.println("\t" + cmd + " -f e:/content.txt -c utf8");
    System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -m max-word-length");
    System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -a cjk");
    System.out.println("\n若是控制台进入\"paoding>\"后:");
    titlePrinted = false;
    printTitleIfNotPrinted("\t");
  }
 
 
  private static boolean titlePrinted = false;
  private static boolean welcomePrinted = false;
  private static void printTitleIfNotPrinted(String prefix) {
    if (!titlePrinted) {
      System.out.println();
      if (!welcomePrinted) {
        System.out.println("Welcome to Paoding Analyser(2.0.4-alpha2)");
        System.out.println();
        welcomePrinted = true;
      }
      System.out.println(prefix + "直接输入或粘贴要被分词的内容,以分号;结束,回车后开始分词。");
      System.out.println(prefix + "另起一行输入:clear或:c,使此次输入无效,用以重新输入。");
      System.out.println(prefix + "要使用命令行参数读入文件内容或其他参数请以冒号:开始,然后输入参数选项。");
      System.out.println(prefix + "退出,请输入:quit或:q、:exit、:e");
      System.out.println(prefix + "需要帮助,请输入:?");
      System.out.println(prefix + "注意:指定对文件分词之前要了解该文件的编码,如果系统编码和文件编码不一致,要通过-c指定文件的编码。");
      System.out.println();
      titlePrinted = true;
    }
  }
 
   
  static String getContent(String path, String encoding) throws IOException {
    return (String) read(path, encoding, true);
  }
 
  static Reader getReader(String path, String encoding) throws IOException {
    return (Reader) read(path, encoding, false);
  }
 
  static Object read(String path, String encoding, boolean return_string) throws IOException {
    InputStream in;
    if (path.startsWith("classpath:")) {
      path = path.substring("classpath:".length());
      URL url = Estimate.class.getClassLoader().getResource(path);
      if (url == null) {
        throw new IllegalArgumentException("Not found " + path
            + " in classpath.");
      }
      System.out.println("read content from:" + url.getFile());
      in = url.openStream();
    } else {
      File f = new File(path);
      if (!f.exists()) {
        throw new IllegalArgumentException("Not found " + path
            + " in system.");
      }
      System.out.println("read content from:" + f.getAbsolutePath());
      in = new FileInputStream(f);
    }
    Reader re;
    if (encoding != null) {
      re = new InputStreamReader(in, encoding);
    } else {
      re = new InputStreamReader(in);
    }
    if (!return_string) {
      return re;
    }
    char[] chs = new char[1024];
    int count;
    // 为兼容低版本的JDK,使用StringBuffer而不是StringBuilder
    StringBuffer content = new StringBuffer();
    while ((count = re.read(chs)) != -1) {
      content.append(chs, 0, count);
    }
    re.close();
    return content.toString();
    }
}
TOP

Related Classes of net.paoding.analysis.analyzer.estimate.TryPaodingAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.