Examples of LabelAlphabet


Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    //    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
//        outfile), enc2));
    StopWords sw = new StopWords(stopwordfile);
   
    LabelAlphabet dict = new LabelAlphabet();
    // words in documents
    ArrayList<TIntArrayList> documentsList= new ArrayList<TIntArrayList>();
   
   
    String line = null;
    while ((line = in.readLine()) != null) {
      line = line.trim()
      if(line.length()==0)
        continue;
      String[] toks = line.split("\\s+");
      TIntArrayList wordlist = new TIntArrayList();
      for(int j=0;j<toks.length;j++){
        String tok = toks[j];
        if(sw.isStopWord(tok))
          continue;
        int idx = dict.lookupIndex(tok);
        wordlist.add(idx);
      }
      documentsList.add(wordlist);
    }
    in.close();
    int[][] documents;
    documents = new int[documentsList.size()][];
    for(int i=0;i<documents.length;i++){
      documents[i] = documentsList.get(i).toArray();
    }
        // vocabulary
        int V = dict.size();
        int M = documents.length;
        // # topics
        int K = 4;
        // good values alpha = 2, beta = .5
        float alpha = 2f;
        float beta = .5f;

        System.out.println("Latent Dirichlet Allocation using Gibbs Sampling.");

        LdaGibbsSampler lda = new LdaGibbsSampler(documents, V);
        lda.configure(10000, 2000, 100, 10);
        lda.gibbs(K, alpha, beta);

        float[][] theta = lda.getTheta();
        float[][] phi = lda.getPhi();

        System.out.println();
        System.out.println();
        System.out.println("Document--Topic Associations, Theta[d][k] (alpha="
            + alpha + ")");
        System.out.print("d\\k\t");
        for (int m = 0; m < theta[0].length; m++) {
            System.out.print("   " + m % 10 + "    ");
        }
        System.out.println();
        for (int m = 0; m < theta.length; m++) {
            System.out.print(m + "\t");
            for (int k = 0; k < theta[m].length; k++) {
                // System.out.print(theta[m][k] + " ");
                System.out.print(shadefloat(theta[m][k], 1) + " ");
            }
            System.out.println();
        }
        System.out.println();
        System.out.println("Topic--Term Associations, Phi[k][w] (beta=" + beta
            + ")");

        System.out.print("k\\w\t");
        for (int w = 0; w < phi[0].length; w++) {
            System.out.print("   " + dict.lookupString(w) + "    ");
        }
        System.out.println();
        for (int k = 0; k < phi.length; k++) {
            System.out.print(k + "\t");
            for (int w = 0; w < phi[k].length; w++) {
              System.out.print(lnf.format(phi[k][w]) + " ");
//              System.out.print(phi[k][w] + " ");
//                System.out.print(shadefloat(phi[k][w], 1) + " ");
            }
            System.out.println();
        }
        for (int k = 0; k < phi.length; k++) {
          int[] top = MyArrays.sort(phi[k]);
         
            for (int w = 0; w < 10; w++) {
              System.out.print(dict.lookupString(top[w]) + " ");
            }
            System.out.println();
        }
    }
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    }

    public KMeansWordCluster(String alphabetPath, String classCenterPath, String templatePath, String classPath) throws Exception {
        readTemplete(templatePath);
        readClass(classPath);
        LabelAlphabet alphabetRead = (LabelAlphabet)loadObject(alphabetPath);
        @SuppressWarnings("unchecked")
        ArrayList<HashSparseVector> classCenterRead = (ArrayList<HashSparseVector>)loadObject(classCenterPath);
        setAlphabet(alphabetRead);
        setClassCenter(classCenterRead);
        addClassCount();
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    //TODO: 修改字典类型
    AlphabetFactory.defaultFeatureType = Type.String;
    /**
     * 标签转为0、1、2、...
     */
    LabelAlphabet labels = factory.DefaultLabelAlphabet();

    // 将样本通过Pipe抽取特征
    IFeatureAlphabet features = factory.DefaultFeatureAlphabet();
    featurePipe = new Sequence2FeatureSequence(templets, features, labels);

View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    addEnTag(cl,file);
    cl.saveTo(file);
  }

  private void addEnTag(AbstractTagger cl, String file) throws IOException {
    LabelAlphabet label = cl.factory.DefaultLabelAlphabet()
    HashMap<String, String> map = MyCollection.loadStringStringMap(c2ePath);
    cl.factory.remove("label-en");
    LabelAlphabet enLabel = cl.factory.buildLabelAlphabet("label-en");
    enLabel.clear();
    enLabel.setStopIncrement(false);
    for(int i=0;i<label.size();i++){
      String cn = label.lookupString(i);
      String en = map.get(cn);     
      if(en==null)
        System.out.println("POSTag Not Found: "+cn);
      int id = enLabel.lookupIndex(en);     
    }
    enLabel.setStopIncrement(true);
  }
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

      factory = AlphabetFactory.buildFactory();

    /**
     * 标签转为0、1、2、...
     */
    LabelAlphabet labels = factory.DefaultLabelAlphabet();

    // 将样本通过Pipe抽取特征
    // 这里用的重建特征,而Label不需要重建
    // 测试时不需要重建特征
    IFeatureAlphabet features = null;
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    long beginTime = System.currentTimeMillis();

    Pipe pipe = createProcessor(false);
    InstanceSet trainSet = new InstanceSet(pipe, factory);

    LabelAlphabet labels = factory.DefaultLabelAlphabet();
    IFeatureAlphabet features = factory.DefaultFeatureAlphabet();

    // 训练集
    trainSet.loadThruStagePipes(new SequenceReader(train,true, "utf8"));

    long endTime = System.currentTimeMillis();
    System.out.println(" done!");
    System.out
    .println("Time escape: " + (endTime - beginTime) / 1000 + "s");
    System.out.println();

    // 输出
    System.out.println("Training Number: " + trainSet.size());

    System.out.println("Label Number: " + labels.size()); // 标签个数
    System.out.println("Feature Number: " + features.size()); // 特征个数

    // 冻结特征集
    features.setStopIncrement(true);
    labels.setStopIncrement(true);

    InstanceSet testSet = null;
    // /////////////////
    if (testfile != null) {

      Pipe tpipe;
      if (false) {// 如果test data没有标注
        tpipe = new SeriesPipes(new Pipe[] { featurePipe });
      } else {
        tpipe = pipe;
      }

      // 测试集
      testSet = new InstanceSet(tpipe);

      testSet.loadThruStagePipes(new SequenceReader(testfile, true, "utf8"));
      System.out.println("Test Number: " + testSet.size()); // 样本个数
    }

    /**
     *
     * 更新参数的准则
     */
    Update update;
    // viterbi解码
    Inferencer inference;
    boolean standard = true;
    HammingLoss loss = new HammingLoss();
    if (standard) {
      inference = new LinearViterbi(templets, labels.size());
      update = new LinearViterbiPAUpdate((LinearViterbi) inference, loss);
    } else {
      inference = new HigherOrderViterbi(templets, labels.size());
      update = new HigherOrderViterbiPAUpdate(templets, labels.size(), true);
    }

    OnlineTrainer trainer = new OnlineTrainer(inference, update, loss,
        features.size(), iterNum, c1);
   
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    int ENG_all = 0, ENG_right = 0;
    Loss loss = new HammingLoss();

    String[][] labelsSet = new String[testSet.size()][];
    String[][] targetSet = new String[testSet.size()][];
    LabelAlphabet labels = cl.getAlphabetFactory().buildLabelAlphabet(
        "labels");
    for (int i = 0; i < testSet.size(); i++) {
      Instance carrier = testSet.get(i);
      int[] pred = (int[]) cl.classify(carrier).getLabel(0);
      if (acc) {
        len += pred.length;
        double e = loss.calc(carrier.getTarget(), pred);
        error += e;
        if(e != 0)
          senError++;
        //测试中英混杂语料
        if(hasENG) {
          String[][] origin = (String[][])carrier.getSource();
          int[] target = (int[])carrier.getTarget();
          for(int j = 0; j < target.length; j++) {
            if(origin[j][0].contains("ENG")) {
              ENG_all++;
              if(target[j] == pred[j])
                ENG_right++;
            }
          }
        }
      }
      labelsSet[i] = labels.lookupString(pred);
      targetSet[i] = labels.lookupString((int[])carrier.getTarget());
    }

    long endtime = System.currentTimeMillis();
    System.out.println("totaltime\t" + (endtime - starttime) / 1000.0);
    System.out.println("feature\t" + (featuretime - starttime) / 1000.0);
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    int ENG_all = 0, ENG_right = 0;
    Loss loss = new HammingLoss();

    String[][] labelsSet = new String[testSet.size()][];
    String[][] targetSet = new String[testSet.size()][];
    LabelAlphabet labels = cl.getAlphabetFactory().buildLabelAlphabet(
        "labels");
    for (int i = 0; i < testSet.size(); i++) {
      Instance carrier = testSet.get(i);
      int[] pred = (int[]) cl.classify(carrier).getLabel(0);
      if (acc) {
        len += pred.length;
        double e = loss.calc(carrier.getTarget(), pred);
        error += e;
        if(e != 0)
          senError++;
        //测试中英混杂语料
        if(hasENG) {
          String[][] origin = (String[][])carrier.getSource();
          int[] target = (int[])carrier.getTarget();
          for(int j = 0; j < target.length; j++) {
            if(origin[j][0].contains("ENG")) {
              ENG_all++;
              if(target[j] == pred[j])
                ENG_right++;
            }
          }
        }
      }
      labelsSet[i] = labels.lookupString(pred);
      targetSet[i] = labels.lookupString((int[])carrier.getTarget());
    }

    long endtime = System.currentTimeMillis();
    System.out.println("totaltime\t" + (endtime - starttime) / 1000.0);
    System.out.println("feature\t" + (featuretime - starttime) / 1000.0);
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    CoNLLReader reader = new CoNLLReader(file);

    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
        new FileOutputStream(fp), charset));
   
    LabelAlphabet postagAlphabet = factory.buildLabelAlphabet("postag");

    int count = 0;
    while (reader.hasNext()) {

      Sentence instance = (Sentence) reader.next();
      int[] heads = (int[]) instance.getTarget();
      ParsingState state = new ParsingState(instance,factory);
      while (!state.isFinalState()) {
        // 左右焦点词在句子中的位置
        int[] lr = state.getFocusIndices();

        HashSparseVector features = state.getFeatures();
        ParsingState.Action action = getAction(lr[0], lr[1],
            heads);
        state.next(action);
        if (action == ParsingState.Action.LEFT)
          heads[lr[1]] = -1;
        if (action == ParsingState.Action.RIGHT)
          heads[lr[0]] = -1;

        // writer.write(String.valueOf(instance.postags[lr[0]]));
        String pos = instance.getTagAt(lr[0]);
        postagAlphabet.lookupIndex(pos);
        writer.write(pos);
        writer.write(" ");
        switch (action) {
        case LEFT:
          writer.write("L");
View Full Code Here

Examples of org.fnlp.ml.types.alphabet.LabelAlphabet

    fp = File.createTempFile("train-features", null, new File("./tmp/"));

    buildInstanceList(dataFile);

    LabelAlphabet postagAlphabet = factory.buildLabelAlphabet("postag");

    IFeatureAlphabet features = factory.DefaultFeatureAlphabet();

    SFGenerator generator = new SFGenerator();
    Linear[] models = new Linear[postagAlphabet.size()];
    int fsize = features.size();

    for (int i = 0; i < postagAlphabet.size(); i++) {
      String pos = postagAlphabet.lookupString(i);
      InstanceSet instset = readInstanceSet(pos);
      LabelAlphabet alphabet = factory.buildLabelAlphabet(pos);
      int ysize = alphabet.size();
      System.out.printf("Training with data: %s\n", pos);
      System.out.printf("Number of labels: %d\n", ysize);
      LinearMax solver = new LinearMax(generator, ysize);
      ZeroOneLoss loss = new ZeroOneLoss();
      Update update = new LinearMaxPAUpdate(loss);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.