Examples of cc.mallet.types.InstanceList

cc.mallet.types.InstanceList
A list of machine learning instances, typically used for training or testing of a machine learning algorithm.
All of the instances in the list will have been passed through the same {@link cc.mallet.pipe.Pipe}, and thus must also share the same data and target Alphabets. InstanceList keeps a reference to the pipe and the two alphabets.
The most common way of adding instances to an InstanceList is through the add(PipeInputIterator) method. PipeInputIterators are a way of mapping general data sources into instances suitable for processing through a pipe. As each {@link cc.mallet.types.Instance} is pulled from the PipeInputIterator, the InstanceList copies the instance and runs the copy through its pipe (with resultant destructive modifications) before saving the modified instance on its list. This is the usual way in which instances are transformed by pipes.
InstanceList also contains methods for randomly generating lists of feature vectors; splitting lists into non-overlapping subsets (useful for test/train splits), and iterators for cross validation. @see Instance @see Pipe @author Andrew McCallum mccallum@cs.umass.edu


  // Tests that setWeightsDimensionDensely respects featureSelections
  public void testDenseFeatureSelection() {
    Pipe p = makeSpacePredictionPipe();


    InstanceList instances = new InstanceList(p);
    instances.addThruPipe(new ArrayIterator(data));


    // Test that dense observation weights aren't added for
    // "default-feature" edges.
    CRF crf1 = new CRF(p, null);
    crf1.addOrderNStates(instances, new int[] { 0 }, null, "start", null,

View Full Code Here

  }


  public void testXis() {
    Pipe p = makeSpacePredictionPipe();


    InstanceList instances = new InstanceList(p);
    instances.addThruPipe(new ArrayIterator(data));


    CRF crf1 = new CRF(p, null);
    crf1.addFullyConnectedStatesForLabels();
    CRFTrainerByLabelLikelihood crft1 = new CRFTrainerByLabelLikelihood(
        crf1);
    crft1.train(instances, 10); // Let's get some parameters


    Instance inst = instances.get(0);
    Sequence input = (Sequence) inst.getData();
    SumLatticeDefault lattice = new SumLatticeDefault(crf1, input,
        (Sequence) inst.getTarget(), null, true);
    for (int ip = 0; ip < lattice.length() - 1; ip++) {
      for (int i = 0; i < crf1.numStates(); i++) {

View Full Code Here

    double deltaPoints = (double) instances.size();
    int iterations = 0;
    SparseVector clusterMean;


    for (int c = 0; c < numClusters; c++) {
      instanceClusters.add(c, new InstanceList(instancePipe));
    }


    logger.info("Entering KMeans iteration");


    while (deltaMeans > MEANS_TOLERANCE && iterations < MAX_ITER
        && deltaPoints > instances.size() * POINTS_TOLERANCE) {


      iterations++;
      deltaPoints = 0;


      // For each instance, measure its distance to the current cluster
      // means, and subsequently assign it to the closest cluster
      // by adding it to the corresponding instance list.
      // The mean of each cluster InstanceList is then updated.
      for (int n = 0; n < instances.size(); n++) {


        instClust = 0;
        instClustDist = Double.MAX_VALUE;


        for (int c = 0; c < numClusters; c++) {
          instDist = metric.distance(clusterMeans.get(c),
              (SparseVector) instances.get(n).getData());


          if (instDist < instClustDist) {
            instClust = c;
            instClustDist = instDist;
          }
        }
        // Add to closest cluster & label it such
        instanceClusters.get(instClust).add(instances.get(n));


        if (clusterLabels[n] != instClust) {
          clusterLabels[n] = instClust;
          deltaPoints++;
        }


      }


      deltaMeans = 0;


      for (int c = 0; c < numClusters; c++) {


        if (instanceClusters.get(c).size() > 0) {
          clusterMean = VectorStats.mean(instanceClusters.get(c));


          deltaMeans += metric.distance(clusterMeans.get(c), clusterMean);


          clusterMeans.set(c, clusterMean);


          instanceClusters.set(c, new InstanceList(instancePipe));


        } else {


          logger.info("Empty cluster found.");


          switch (emptyAction) {
            case EMPTY_ERROR:
              return null;
            case EMPTY_DROP:
              logger.fine("Removing cluster " + c);
              clusterMeans.remove(c);
              instanceClusters.remove(c);
              for (int n = 0; n < instances.size(); n++) {


                assert (clusterLabels[n] != c) : "Cluster size is "
                    + instanceClusters.get(c).size()
                    + "+ yet clusterLabels[n] is " + clusterLabels[n];


                if (clusterLabels[n] > c)
                  clusterLabels[n]--;
              }


              numClusters--;
              c--; // <-- note this trickiness. bad style? maybe.
              // it just means now that we've deleted the entry,
              // we have to repeat the index to get the next entry.
              break;


            case EMPTY_SINGLE:


              // Get the instance the furthest from any centroid
              // and make it a new centroid.


              double newCentroidDist = 0;
              int newCentroid = 0;
              InstanceList cacheList = null;


              for (int clusters = 0; clusters < clusterMeans.size(); clusters++) {
                SparseVector centroid = clusterMeans.get(clusters);
                InstanceList centInstances = instanceClusters.get(clusters);


                // Don't create new empty clusters.


                if (centInstances.size() <= 1)
                  continue;
                for (int n = 0; n < centInstances.size(); n++) {
                  double currentDist = metric.distance(centroid,
                      (SparseVector) centInstances.get(n).getData());
                  if (currentDist > newCentroidDist) {
                    newCentroid = n;
                    newCentroidDist = currentDist;
                    cacheList = centInstances;

View Full Code Here

          MEMM.State source = (MEMM.State) ti.getSourceState();
          if (count != 0) {
            // Create the source state's trainingSet if it doesn't exist yet.
            if (source.trainingSet == null)
              // New InstanceList with a null pipe, because it doesn't do any processing of input.
              source.trainingSet = new InstanceList (null);
            // TODO We should make sure we don't add duplicates (through a second call to setWeightsDimension...)!
            // TODO Note that when the training data still allows ambiguous outgoing transitions
            // this will add the same FV more than once to the source state's trainingSet, each
            // with >1.0 weight.  Not incorrect, but inefficient.
//            System.out.println ("From: "+source.getName()+" ---> "+getOutput()+" : "+getInput());

View Full Code Here


  public void printInstanceLists ()
  {
    for (int i = 0; i < memm.numStates(); i++) {
      State state = (State) memm.getState (i);
      InstanceList training = state.trainingSet;
      System.out.println ("State "+i+" : "+state.getName());
      if (training == null) {
        System.out.println ("No data");
        continue;
      }
      for (int j = 0; j < training.size(); j++) {
        Instance inst = training.get (j);
        System.out.println ("From : "+state.getName()+" To : "+inst.getTarget());
        System.out.println ("Instance "+j);
        System.out.println (inst.getTarget());
        System.out.println (inst.getData());
      }

View Full Code Here

  
  public InstanceList getInstances () { return this.instances; }


  /** Return an list of instances with a particular label. */
  public InstanceList getCluster(int label) {    
    InstanceList cluster = new InstanceList(instances.getPipe());    
    for (int n=0 ; n<instances.size() ; n++) 
      if (labels[n] == label)
        cluster.add(instances.get(n));      
    return cluster;
  }

View Full Code Here

  @Override
  public double[] getEvaluationScores(Clustering truth, Clustering predicted) {
    double precision = 0.0;
    double recall = 0.0;


    InstanceList instances = truth.getInstances();


    for (int i = 0; i < instances.size(); i++) {
      int trueLabel = truth.getLabel(i);
      int predLabel = predicted.getLabel(i);
      int[] trueIndices = truth.getIndicesWithLabel(trueLabel);
      int[] predIndices = predicted.getIndicesWithLabel(predLabel);


      int correct = 0;
      for (int j = 0; j < predIndices.length; j++) {
        for (int k = 0; k < trueIndices.length; k++)
          if (trueIndices[k] == predIndices[j])
            correct++;
      }      
      precision += (double)correct / predIndices.length;
      recall += (double)correct / trueIndices.length;    
    }


    macroPrecision += precision;
    macroRecall += recall;
    macroNumInstances += instances.size();


    precision /= instances.size();
    recall /= instances.size();
    return new double[]{precision, recall, (2 * precision * recall / (precision + recall))};
  }

View Full Code Here

   * @param lj
   * @return A new {@link InstanceList} where <code>lj</code> is appended to <code>li</code>.
   */
  public static InstanceList combineLists (InstanceList li,
                                           InstanceList lj) {
    InstanceList newList = new InstanceList(li.getPipe());
    for (int i = 0; i < li.size(); i++) 
      newList.add(li.get(i));
    for (int i = 0; i < lj.size(); i++) 
      newList.add(lj.get(i));
    return newList;
  }

View Full Code Here

                                          int labeli, int labelj) {
    if (labeli == labelj)
      return clustering;
    
    // Set all labelj labels to labeli.
    InstanceList instances = clustering.getInstances();    
    for (int i = 0; i < instances.size(); i++) {
      int idx = clustering.getLabel(i);
      if (idx == labelj)
        clustering.setLabel(i, labeli);
    }
    clustering.setNumLabels(clustering.getNumClusters() - 1);


    // Decrement cluster indices that are greater than the number of clusters.
    for (int i = 0; i < instances.size(); i++) {
      int idx = clustering.getLabel(i);
      if (idx > labelj)
        clustering.setLabel(i, idx - 1);
    }

View Full Code Here

   * @param i
   * @param j
   * @return A new {@link InstanceList} containing the two argument {@link Instance}s.
   */
  public static InstanceList makeList (Instance i, Instance j) {
    InstanceList list = new InstanceList(new Noop(i.getDataAlphabet(), i.getTargetAlphabet()));
    list.add(i);
    list.add(j);
    return list;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of cc.mallet.types.InstanceList

cc.mallet.classify.C45$Node

cc.mallet.classify.DecisionTree$Node

cc.mallet.classify.MaxEntPRTrainer

cc.mallet.classify.NaiveBayesEMTrainer

cc.mallet.classify.tui.Vectors2FeatureConstraints

cc.mallet.cluster.Clustering

cc.mallet.cluster.evaluate.BCubedEvaluator

cc.mallet.cluster.evaluate.tests.TestClusteringEvaluators

cc.mallet.cluster.examples.FirstOrderClusterExample

cc.mallet.cluster.examples.FirstOrderClusterExample$OverlappingFeaturePipe

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.