// Package edu.stanford.nlp.wordseg
//
// Source code of edu.stanford.nlp.wordseg.CorpusChar

package edu.stanford.nlp.wordseg;

import java.util.*;
import java.io.*;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;

/**
* Check tag of each character from 5 different corpora. (4 official training corpora of Sighan bakeoff 2005, plus CTB)
* These tags are not external knowledge. They are learned from the training corpora.
* @author Huihsin Tseng
* @author Pichuan Chang
*/

public class CorpusChar {
  private Map <String, Set <String>> charMap;

  public CorpusChar(String charlistFilename)  {
    charMap=readDict(charlistFilename);
  }

  Map<String, Set<String>> getCharMap() {
    return charMap;
  }
 
 
  private Map<String, Set <String>> char_dict;

  private Map<String, Set<String>> readDict(String filename)  {

    System.err.println("Loading character dictionary file from " + filename);

    try {
      InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);     
      BufferedReader DetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
      String DetectorLine;

      char_dict = Generics.newHashMap();
      //System.err.println("DEBUG: in CorpusChar readDict");
      while ((DetectorLine = DetectorReader.readLine()) != null) {
       
        String[] fields = DetectorLine.split("  ");
        String tag=fields[0];

        Set<String> chars=char_dict.get(tag);
  
        if(chars==null){
          chars = Generics.newHashSet();
          char_dict.put(tag,chars);
        }
        //System.err.println("DEBUG: CorpusChar: "+filename+" "+fields[1]);
        chars.add(fields[1]);
   
   
      }
      is.close();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return char_dict;
  }

  public String getTag(String a1, String a2) {
    Map<String, Set<String>> h1=getCharMap();
    Set<String> h2=h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
  }
  /*
  public String getCtbTag(String a1, String a2) {
    HashMap h1=dict.getctb();
    Set h2=(Set)h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
  }

  public String getAsbcTag(String a1, String a2) {
    HashMap h1=dict.getasbc();
    Set h2=(Set)h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
  }
 
  public String getPkuTag(String a1, String a2) {
    HashMap h1=dict.getpku();
    Set h2=(Set)h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
  }

  public String getHkTag(String a1, String a2) {
    HashMap h1=dict.gethk();
    Set h2=(Set)h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
  }


  public String getMsrTag(String a1, String a2) {
    HashMap h1=dict.getmsr();
    Set h2=(Set)h1.get(a1);
    if (h2 == null) return "0";
    if (h2.contains(a2))
      return "1";
    return "0";
    }*/

}//end of class
// TOP
//
// Related Classes of edu.stanford.nlp.wordseg.CorpusChar
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.