Examples of cc.mallet.types.LabelSequence

cc.mallet.types.LabelSequence

    InstanceList ret = new InstanceList(alphabetsPipe);
    Object obj = inst.getData();
    assert(obj instanceof FeatureVectorSequence);


    FeatureVectorSequence fvs = (FeatureVectorSequence) obj;
    LabelSequence ls = (LabelSequence) inst.getTarget();
    assert(fvs.size() == ls.size());


    Object instName = (inst.getName() == null ? "NONAME" : inst.getName());
    
    for (int j = 0; j < fvs.size(); j++) {
      FeatureVector fv = fvs.getFeatureVector(j);
      int[] indices = fv.getIndices();
      FeatureVector data = new AugmentableFeatureVector (alphabetsPipe.getDataAlphabet(),
          indices, fv.getValues(), indices.length); 
      Labeling target = ls.getLabelAtPosition(j);
      String name = instName.toString() + "_@_POS_" + (j + 1);
      Object source = inst.getSource();
      Instance toAdd = alphabetsPipe.pipe(new Instance(data, target, name, source));


      ret.add(toAdd);

View Full Code Here

  public void inferenceAll(int maxIteration){
    this.test = new ArrayList<Topication>();  //initialize test
    //initial sampling on testdata
    ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>();
    for (Instance instance : testing) {
      LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]);
      if (false) {
        // This method not yet obeying its last "false" argument, and must be for this to work
        //sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false);
      } else {
        Randoms r = new Randoms();
        FeatureSequence fs = (FeatureSequence) instance.getData();
        int[] topics = topicSequence.getFeatures();
        for (int i = 0; i < topics.length; i++) {
          int type = fs.getIndexAtPosition(i);
          topics[i] = r.nextInt(numTopics);
          typeTopicCounts[type].adjustOrPutValue(topics[i], 1, 1);
            tokensPerTopic[topics[i]]++;
        }
      }
      topicSequences.add (topicSequence);
    }


    //construct test
    assert (testing.size() == topicSequences.size());
    for (int i = 0; i < testing.size(); i++) {
      Topication t = new Topication (testing.get(i), this, topicSequences.get(i));
      test.add (t);
    }


    long startTime = System.currentTimeMillis();
    //loop
    int iter = 0;
    for ( ; iter <= maxIteration; iter++) {
      if(iter%100==0)
      {
        System.out.print("Iteration: " + iter);
        System.out.println();
      }
      int numDocs = test.size(); // TODO
      for (int di = 0; di < numDocs; di++) {
        FeatureSequence tokenSequence = (FeatureSequence) test.get(di).instance.getData();
        LabelSequence topicSequence = test.get(di).topicSequence;
        sampleTopicsForOneTestDocAll (tokenSequence, topicSequence);
      }
    }


    long seconds = Math.round((System.currentTimeMillis() - startTime)/1000.0);

View Full Code Here

  public void inference(int maxIteration){
    this.test = new ArrayList<Topication>();  //initialize test
    //initial sampling on testdata
    ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>();
    for (Instance instance : testing) {
      LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]);
      if (false) {
        // This method not yet obeying its last "false" argument, and must be for this to work
        //sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false);
      } else {
        Randoms r = new Randoms();
        FeatureSequence fs = (FeatureSequence) instance.getData();
        int[] topics = topicSequence.getFeatures();
        for (int i = 0; i < topics.length; i++) {
          int type = fs.getIndexAtPosition(i);
          topics[i] = r.nextInt(numTopics);
        /*  if(typeTopicCounts[type].size() != 0) {
            topics[i] = r.nextInt(numTopics);
          } else {
            topics[i] = -1;  // for unseen words
          }*/
        }
      }
      topicSequences.add (topicSequence);
    }


    //construct test
    assert (testing.size() == topicSequences.size());
    for (int i = 0; i < testing.size(); i++) {
      Topication t = new Topication (testing.get(i), this, topicSequences.get(i));
      test.add (t);
      // Include sufficient statistics for this one doc
      // add count on new data to n[k][w] and n[k][*]
      // pay attention to unseen words
      FeatureSequence tokenSequence = (FeatureSequence) t.instance.getData();
      LabelSequence topicSequence = t.topicSequence;
      for (int pi = 0; pi < topicSequence.getLength(); pi++) {
        int topic = topicSequence.getIndexAtPosition(pi);
        int type = tokenSequence.getIndexAtPosition(pi);
        if(topic != -1) // type seen in training
        {
          typeTopicCounts[type].adjustOrPutValue(topic, 1, 1);
            tokensPerTopic[topic]++;
        }
      }
    }


    long startTime = System.currentTimeMillis();
    //loop
    int iter = 0;
    for ( ; iter <= maxIteration; iter++) {
      if(iter%100==0)
      {
        System.out.print("Iteration: " + iter);
        System.out.println();
      }
      int numDocs = test.size(); // TODO
      for (int di = 0; di < numDocs; di++) {
        FeatureSequence tokenSequence = (FeatureSequence) test.get(di).instance.getData();
        LabelSequence topicSequence = test.get(di).topicSequence;
        sampleTopicsForOneTestDoc (tokenSequence, topicSequence);
      }
    }


    long seconds = Math.round((System.currentTimeMillis() - startTime)/1000.0);

View Full Code Here

  public void inferenceOneByOne(int maxIteration){
    this.test = new ArrayList<Topication>();  //initialize test
    //initial sampling on testdata
    ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>();
    for (Instance instance : testing) {
      LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]);
      if (false) {
        // This method not yet obeying its last "false" argument, and must be for this to work
        //sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false);
      } else {
        Randoms r = new Randoms();
        FeatureSequence fs = (FeatureSequence) instance.getData();
        int[] topics = topicSequence.getFeatures();
        for (int i = 0; i < topics.length; i++) {
          int type = fs.getIndexAtPosition(i);
          topics[i] = r.nextInt(numTopics);
          typeTopicCounts[type].adjustOrPutValue(topics[i], 1, 1);
          tokensPerTopic[topics[i]]++;
        /*  if(typeTopicCounts[type].size() != 0) {
            topics[i] = r.nextInt(numTopics);
            typeTopicCounts[type].adjustOrPutValue(topics[i], 1, 1);
            tokensPerTopic[topics[i]]++;
          } else {
            topics[i] = -1;  // for unseen words
          }*/
        }
      }
      topicSequences.add (topicSequence);
    }


    //construct test
    assert (testing.size() == topicSequences.size());
    for (int i = 0; i < testing.size(); i++) {
      Topication t = new Topication (testing.get(i), this, topicSequences.get(i));
      test.add (t);
    }


    long startTime = System.currentTimeMillis();
    //loop
    int iter = 0;
    int numDocs = test.size(); // TODO
    for (int di = 0; di < numDocs; di++) {
      iter = 0;
      FeatureSequence tokenSequence = (FeatureSequence) test.get(di).instance.getData();
      LabelSequence topicSequence = test.get(di).topicSequence;
      for( ; iter <= maxIteration; iter++) {
        sampleTopicsForOneTestDoc (tokenSequence, topicSequence);
      }
      if(di%100==0)
      {

View Full Code Here

  public void inferenceWithTheta(int maxIteration, InstanceList theta){
    this.test = new ArrayList<Topication>();  //initialize test
    //initial sampling on testdata
    ArrayList<LabelSequence> topicSequences = new ArrayList<LabelSequence>();
    for (Instance instance : testing) {
      LabelSequence topicSequence = new LabelSequence(topicAlphabet, new int[instanceLength(instance)]);
      if (false) {
        // This method not yet obeying its last "false" argument, and must be for this to work
        //sampleTopicsForOneDoc((FeatureSequence)instance.getData(), topicSequence, false, false);
      } else {
        Randoms r = new Randoms();
        FeatureSequence fs = (FeatureSequence) instance.getData();
        int[] topics = topicSequence.getFeatures();
        for (int i = 0; i < topics.length; i++) {
          int type = fs.getIndexAtPosition(i);
          topics[i] = r.nextInt(numTopics);
        }
      }
      topicSequences.add (topicSequence);
    }


    //construct test
    assert (testing.size() == topicSequences.size());
    for (int i = 0; i < testing.size(); i++) {
      Topication t = new Topication (testing.get(i), this, topicSequences.get(i));
      test.add (t);
      // Include sufficient statistics for this one doc
      // add count on new data to n[k][w] and n[k][*]
      // pay attention to unseen words
      FeatureSequence tokenSequence = (FeatureSequence) t.instance.getData();
      LabelSequence topicSequence = t.topicSequence;
      for (int pi = 0; pi < topicSequence.getLength(); pi++) {
        int topic = topicSequence.getIndexAtPosition(pi);
        int type = tokenSequence.getIndexAtPosition(pi);
        if(topic != -1) // type seen in training
        {
          typeTopicCounts[type].adjustOrPutValue(topic, 1, 1);
            tokensPerTopic[topic]++;
        }
      }
    }


    long startTime = System.currentTimeMillis();
    //loop
    int iter = 0;
    for ( ; iter <= maxIteration; iter++) {
      if(iter%100==0)
      {
        System.out.print("Iteration: " + iter);
        System.out.println();
      }
      int numDocs = test.size(); // TODO
      for (int di = 0; di < numDocs; di++) {
        FeatureVector fvTheta = (FeatureVector) theta.get(di).getData();
        double[] topicDistribution = fvTheta.getValues();
        FeatureSequence tokenSequence = (FeatureSequence) test.get(di).instance.getData();
        LabelSequence topicSequence = test.get(di).topicSequence;
        sampleTopicsForOneDocWithTheta (tokenSequence, topicSequence, topicDistribution);
      }
    }


    long seconds = Math.round((System.currentTimeMillis() - startTime)/1000.0);

View Full Code Here

    PrintWriter pw = new PrintWriter(new FileWriter(f));
    int[] topicCounts = new int[ numTopics ];
    int docLen;
    
    for (int di = 0; di < dataset.size(); di++) {
      LabelSequence topicSequence = dataset.get(di).topicSequence;
      int[] currentDocTopics = topicSequence.getFeatures();
      docLen = currentDocTopics.length;
      for (int token=0; token < docLen; token++) {
        topicCounts[ currentDocTopics[token] ]++;
      }
      pw.println(dataset.get(di).instance.getName());

View Full Code Here

    if (max < 0 || max > numTopics) {
      max = numTopics;
    }


    for (int di = 0; di < dataset.size(); di++) {
      LabelSequence topicSequence = dataset.get(di).topicSequence;
      int[] currentDocTopics = topicSequence.getFeatures();


      pw.print (di); pw.print (' ');


      if (dataset.get(di).instance.getSource() != null) {
        pw.print (dataset.get(di).instance.getSource());

View Full Code Here


    out.println ("#doc source pos typeindex type topic");


    for (int di = 0; di < dataset.size(); di++) {
      FeatureSequence tokenSequence =  (FeatureSequence) dataset.get(di).instance.getData();
      LabelSequence topicSequence =  dataset.get(di).topicSequence;


      String source = "NA";
      if (dataset.get(di).instance.getSource() != null) {
        source = dataset.get(di).instance.getSource().toString();
      }


      for (int pi = 0; pi < topicSequence.getLength(); pi++) {
        int type = tokenSequence.getIndexAtPosition(pi);
        int topic = topicSequence.getIndexAtPosition(pi);
        out.print(di); out.print(' ');
        out.print(source); out.print(' ');
        out.print(pi); out.print(' ');
        out.print(type); out.print(' ');
        out.print(alphabet.lookupObject(type)); out.print(' ');

View Full Code Here

    public Instance pipe (Instance carrier) {


      Object inputData = carrier.getData();
      Alphabet features = getDataAlphabet();
      LabelAlphabet labels;
      LabelSequence target = null;
      String [][] tokens;


      if (inputData instanceof String) {
        tokens = parseSentence((String)inputData);
      }
      else if (inputData instanceof String[][]) {
        tokens = (String[][])inputData;
      }
      else {
        throw new IllegalArgumentException("Not a String or String[][]; got " + inputData);
      }


      FeatureVector[] fvs = new FeatureVector[tokens.length];
      if (isTargetProcessing()) {
        labels = (LabelAlphabet)getTargetAlphabet();
        target = new LabelSequence (labels, tokens.length);
      }


      for (int l = 0; l < tokens.length; l++) {
        int nFeatures;
        if (isTargetProcessing()) {
          if (tokens[l].length < 1) {
            throw new IllegalStateException ("Missing label at line " + l + " instance "+carrier.getName ());
          }
          nFeatures = tokens[l].length - 1;
          target.add(tokens[l][nFeatures]);
        }
        else nFeatures = tokens[l].length;
        ArrayList<Integer> featureIndices = new ArrayList<Integer>();
        for (int f = 0; f < nFeatures; f++) {
          int featureIndex = features.lookupIndex(tokens[l][f]);
          // gdruck
          // If the data alphabet's growth is stopped, featureIndex
          // will be -1.  Ignore these features.
          if (featureIndex >= 0) {
            featureIndices.add(featureIndex);
          }
        }
        int[] featureIndicesArr = new int[featureIndices.size()];
        for (int index = 0; index < featureIndices.size(); index++) {
          featureIndicesArr[index] = featureIndices.get(index);
        }
        fvs[l] = featureInductionOption.value ? new AugmentableFeatureVector(features, featureIndicesArr, null, featureIndicesArr.length) : 
          new FeatureVector(features, featureIndicesArr);
      }
      carrier.setData(new FeatureVectorSequence(fvs));
      if (isTargetProcessing()) {
        carrier.setTarget(target);
      }
      else {
        carrier.setTarget(new LabelSequence(getTargetAlphabet()));
      }
      return carrier;
    }

View Full Code Here

    public Instance pipe(Instance carrier)
    {
      StringTokenization ts =  (StringTokenization) carrier.getData();
      StringTokenization newTs = new StringTokenization((CharSequence) ts.getDocument ());
      final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet();
      LabelSequence labelSeq = new LabelSequence(dict);
      Label start = dict.lookupLabel ("start");
      Label notstart = dict.lookupLabel ("notstart");


      boolean lastWasSpace = true;
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < ts.size(); i++) {
        StringSpan t = (StringSpan) ts.getSpan(i);
        if (t.getText().equals(" "))
          lastWasSpace = true;
        else {
          sb.append(t.getText());
          newTs.add(t);
          labelSeq.add(lastWasSpace ? "start" : "notstart");
          lastWasSpace = false;
        }
      }
      if (isTargetProcessing())
        carrier.setTarget(labelSeq);

View Full Code Here

0 1

TOP

Related Classes of cc.mallet.types.LabelSequence

cc.mallet.extract.test.TestDocumentExtraction

cc.mallet.fst.SimpleTagger$SimpleTaggerSentence2FeatureVectorSequence

cc.mallet.fst.SimpleTaggerStdin$SimpleTaggerSentence2FeatureVectorSequence

cc.mallet.fst.tests.TestMEMM$TestMEMMTokenSequenceRemoveSpaces

cc.mallet.pipe.AddClassifierTokenPredictions

cc.mallet.topics.LDAStream

cc.mallet.types.FeatureVectorSequence.Iterator

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.