Package syntaxLearner

Source Code of syntaxLearner.Recorder

package syntaxLearner;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import syntaxLearner.corpus.Corpus;
import syntaxLearner.corpus.Vocabulary;
import syntaxLearner.corpus.Word;
/**
* A library that records the algorithm in JSON for a standard interpreter
* @author Omer Shapira
*
*/
public class Recorder {
  // Root output directory; the corpus file and the per-iteration directories are created under it.
  private File mainPath;
  // Directory of the iteration currently being recorded; set by recordNewIteration, cluster files go here.
  private File currentPath;
  // "words" subdirectory of currentPath; set by recordNewIteration, word files go here.
  private File wordPath;
  // Corpus name; embedded in every generated file name and JavaScript variable name.
  private String name;
  // When false, every public record* method returns immediately without writing anything.
  private boolean isRecording;
  // Highest iteration count seen so far from the learner (see updateIterationData).
  private int iterationCounter = 0;
  // The "corpus_<name>.js" file written by recordCorpusData.
  private File corpusFile;
  // Learner whose iteration count and NUMBER_OF_CLUSTERS constant are read by this recorder.
  private Learner l;
  // The "iteration_<name>_<n>.js" file written by recordNewIteration.
  private File iterationFile;
  // Corpus captured by recordCorpusData; recordWordInfo uses its vocabulary for rank lookups.
  private Corpus c;

  public Recorder(Learner l, File mainPath,  String name, boolean isRecording){
    this.mainPath = mainPath;
    this.name = name;
    this.isRecording = isRecording;
    this.l=l;
  }
  /*
   *
   * corpus_%corpusName% = {
    name: "...",
    tokenCount: 500,
    typeCount: 1000,
    commonTypes: 900,
    clusterCount: 100,
    identityEps: 0.00004,
    iterationCount: 7,
    typeToId:
    {
      typeName:0,
      typeName2: 1,
      ...
    },
    idToType: {
    1: "word",
    4: "bleh",
    }
  };

   */
  /**
   *
   */
  public void recordCorpusData (Corpus c, Learner l){
    if (!isRecording) return;
    this.c=c;
    Vocabulary v = c.getVocabulary();
    Set<Map.Entry<String, Integer>> entrySet = v.getWordIndicesEntrySet();
    Set<Map.Entry<Integer, Word>> wordSet = v.getWordEntrySet();
    StringBuilder s = new StringBuilder(v.getNumOfWords()*30);
    corpusFile = new File(mainPath, "corpus_"+name+".js");

    s.append(String.format("corpus_%1$s = \n{\n\tname:\"%1$s\",\n", name));
    s.append(String.format("\ttokenCount: %1$s,\n", c.tokenCount));
    s.append(String.format("\ttypeCount: %1$s,\n", v.getNumOfWords()));
    s.append(String.format("\tcommonTypes: %1$s,\n", (v.getNumOfWords()-v.countWordsBelowThreshold(l.RARE_WORD_THRESHOLD))));
    s.append(String.format("\tclusterCount: %1$s,\n", l.NUMBER_OF_CLUSTERS));
    //TODO add proper number formatters
    s.append(String.format("\tidentityEps: %1$s,\n", l.IDENTITY_EPSILON));
    s.append(String.format("\titerationCount: %1$s,\n", iterationCounter));
    s.append("\ttypeToId: \n\t{\n\t");
    for (Map.Entry<String, Integer> e: entrySet){
      if (v.getWord(e.getValue()).frequency >= l.RARE_WORD_THRESHOLD){ //TODO see if necessary
        s.append(String.format("\t\t\"%1$s\": %2$s,\n", e.getKey(), e.getValue()));
      }
    }
    s.append("},\n");
    s.append("\tidToType: {");
    for (Map.Entry<Integer, Word> e: wordSet){ //No rare word check here, to keep vector status
      if (e.getValue().frequency >= l.RARE_WORD_THRESHOLD){
        s.append(String.format("\t\t%1$s: \"%2$s\",\n",e.getKey(), e.getValue().name))
      }
    }
    s.append("}");
    s.append("\n};");

    try {
      corpusFile.createNewFile();
      BufferedWriter out = new BufferedWriter(new FileWriter(corpusFile));
      out.append(s);
      out.close();
    } catch (IOException e1) {
      // TODO Auto-generated catch block
      e1.printStackTrace();
    }
  }

  private void updateIterationData(){
    if (iterationCounter<l.getIterationCount()){
      iterationCounter = l.getIterationCount();
      //TODO Write iteration number to file   
    }
  }
  /*
   * iteration_%corpusName%_%iterationNumber% = {
  iteration_number : 2,
  unsorted_words: ["a", "b", ...]
};

   */
  /**
   *
   */
  public void recordNewIteration(String words){
    if (!isRecording) return;
    updateIterationData();
    currentPath = new File(mainPath, Integer.toString(iterationCounter));
    currentPath.mkdir();
    wordPath = new File(currentPath, "words");
    wordPath.mkdir();
    iterationFile = new File(currentPath,
        String.format("iteration_%1$s_%2$s.js",name,iterationCounter));
    StringBuilder s = new StringBuilder();
    s.append(String.format("iteration_%1$s_%2$s = {\n",name,iterationCounter));
    s.append(String.format("\titeration_number : %1$s,\n",iterationCounter));
    s.append("\tunsorted_words: ");
    s.append(words);
    s.append("\n}");
    try {
      iterationFile.createNewFile();
      BufferedWriter out = new BufferedWriter(new FileWriter(iterationFile));
      out.append(s);
      out.close();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }


  /*
   * cluster_%corpusName%_%iterationNumber%_%clusterIndex% =
{
  index: 30,
  words: [8, 9, ...],
  distribution: [[0.1, 0.1,...],
           [0.1, 0.1,...],
           [0.1, 0.1,....],
           ..]
}

   */
  /**
   *
   */
  public void recordClusterInfo(Cluster c){
    if (!isRecording) return;
    StringBuilder s = new StringBuilder();
    String filename = String.format("cluster_%1$s_%2$s_%3$s", name, iterationCounter,c.ID);
    s.append(filename+" =\n{\n");
    s.append(String.format("\tindex: %1$s,\n", c.ID));
    s.append("\twords: [");
    for (int i: c.words){
      s.append(" "+i+",");
    }
    //remove last comma
    s.deleteCharAt(s.length()-1);
    s.append(" ],\n");
    s.append("\tdistribution: [");
    for (double d1[] : c.clusterDistribution()){
      for (double d2: d1){
        s.append(String.format(" %1$-4f,", d2));
      }
    }
    s.deleteCharAt(s.length()-1);
    s.append(" ]\n}");
    File f = new File (currentPath, filename+".js");
    try {
      f.createNewFile();
      BufferedWriter out = new BufferedWriter(new FileWriter(f));
      out.append(s);
      out.close();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }

  /*
   * word_%corpusName%_%iterationNumber%_%wordIndex% =
{
  word: "..",
  wordIndex: 5,
  freq: 80,
  rank: 150,
  distribution: [[0.1, 0.1,...],
           [0.1, 0.1,...],
           [0.1, 0.1,....],
           ..],
  dist_to_cluster: []
}

   */
  public void recordWordInfo(Word w, TreeMap<Double, Short> distances, StringBuilder distribution){
    if (!isRecording) return;
    double[] distanceArray = new double[l.NUMBER_OF_CLUSTERS];
    for (Map.Entry<Double, Short> e : distances.entrySet()){
      assert (e.getValue()<distanceArray.length);
      distanceArray[e.getValue()]=e.getKey();
    }
    String filename = String.format("word_%1$s_%2$s_%3$s", name, iterationCounter, w.ID);
    StringBuilder s = new StringBuilder();
    s.append(filename+" =\n{\n");
    s.append(String.format("\tword: \"%1$s\",\n\twordIndex: %2$s,\n", w.name, w.ID));
    s.append(String.format("\tfreq: %1$s,\n", w.frequency));
    s.append(String.format("\trank: %1$s,\n", c.getVocabulary().getRank(w.ID)));
    s.append("\tdistribution: ");
    s.append(distribution);
    s.append(",\n\tdist_to_cluster : [");
    for (double d: distanceArray){
      s.append(String.format(" %1$-4f,", d));
    }
    s.deleteCharAt(s.length()-1);
    s.append(" ]\n}");
    File f = new File(wordPath, filename+".js");
    try {
      f.createNewFile();
      BufferedWriter out = new BufferedWriter(new FileWriter(f));
      out.append(s);
      out.close();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
TOP

Related Classes of syntaxLearner.Recorder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.