Package cc.mallet.types

Examples of cc.mallet.types.InstanceList
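
As orientation before the excerpts: an InstanceList is bound to a Pipe that converts raw input into feature data, and instances are fed through that pipe. Below is a minimal, self-contained sketch of that pattern; the pipe chain and the toy documents are illustrative assumptions, not taken from any of the excerpts.

import java.util.ArrayList;
import java.util.Arrays;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class InstanceListSketch {
  public static void main(String[] args) {
    // Pipe chain: map the target to a Label, tokenize the text, and encode
    // tokens as a FeatureSequence (the representation the topic-model tools below expect).
    Pipe pipe = new SerialPipes(new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence()
    });

    InstanceList instances = new InstanceList(pipe);

    // Each Instance carries (data, target, name, source).
    ArrayList<Instance> raw = new ArrayList<Instance>(Arrays.asList(
        new Instance("the quick brown fox", "animals", "doc0", null),
        new Instance("stocks fell sharply today", "finance", "doc1", null)));
    instances.addThruPipe(raw.iterator());

    System.out.println(instances.size() + " instances, "
        + instances.getDataAlphabet().size() + " features");
  }
}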


    if (printOption.value) {
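      // Walk every clustering; within each cluster, print the data object of each member instance.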
      for (int i = 0; i < clusterings.size(); i++) {
        Clustering c = clusterings.get(i);
        for (int j = 0; j < c.getNumClusters(); j++) {
          InstanceList cluster = c.getCluster(j);
          for (int k = 0; k < cluster.size(); k++) {
            System.out.println("clustering " + i + " cluster " + j + " element " + k + " " + cluster.get(k).getData());
          }
          System.out.println();
        }
      }
    }
View Full Code Here


      Alphabet fieldAlph = new Alphabet();
      Alphabet valueAlph = new Alphabet();
      File directory = new File(classDirs.value[i]);
      File[] subdirs = getSubDirs(directory);
      Alphabet clusterAlph = new Alphabet();
      InstanceList instances = new InstanceList(new Noop());
      TIntArrayList labels = new TIntArrayList();
      for (int j = 0; j < subdirs.length; j++) {
        ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
        int label = clusterAlph.lookupIndex(subdirs[j].toString());
        for (int k = 0; k < records.size(); k++) {
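          // 'fi' is a running file counter declared outside this excerpt; it drives the progress marks below.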
          if (fi % 100 == 0) System.out.print(fi);
          else if (fi % 10 == 0) System.out.print(".");
          if (fi % 1000 == 0 && fi > 0) System.out.println();
          System.out.flush();
          fi++;


          File record = records.get(k);
          labels.add(label);
          instances.add(new Instance(new Record(fieldAlph, valueAlph, parseFile(record)),
                        Integer.valueOf(label), record.toString(),
                        record.toString()));
        }
      }
      clusterings[i] =
View Full Code Here

    CommandOption.setSummary (Vectors2Topics.class,
                  "A tool for estimating, saving and printing diagnostics for topic models, such as LDA.");
    CommandOption.process (Vectors2Topics.class, args);

    if (usePAM.value) {
      InstanceList ilist = InstanceList.load (new File(inputFile.value));
      System.out.println ("Data loaded.");
      if (inputModelFilename.value != null)
        throw new IllegalArgumentException ("--input-model not supported with --use-pam.");
      PAM4L pam = new PAM4L(pamNumSupertopics.value, pamNumSubtopics.value);
      pam.estimate (ilist, numIterations.value, /*optimizeModelInterval*/50,
              showTopicsInterval.value,
              outputModelInterval.value, outputModelFilename.value,
              randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
      pam.printTopWords(topWords.value, true);
      if (stateFile.value != null)
        pam.printState (new File(stateFile.value));
      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        pam.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

     
      if (outputModelFilename.value != null) {
        assert (pam != null);
        try {
          ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (pam);
          oos.close();
        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }
     

    }
   
    else if (useNgrams.value) {
      InstanceList ilist = InstanceList.load (new File(inputFile.value));
      System.out.println ("Data loaded.");
      if (inputModelFilename.value != null)
        throw new IllegalArgumentException ("--input-model not supported with --use-ngrams.");
      TopicalNGrams tng = new TopicalNGrams(numTopics.value,
                          alpha.value,
                          beta.value,
                          gamma.value,
                          delta.value,
                          delta1.value,
                          delta2.value);
      tng.estimate (ilist, numIterations.value, showTopicsInterval.value,
              outputModelInterval.value, outputModelFilename.value,
              randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
      tng.printTopWords(topWords.value, true);
      if (stateFile.value != null)
        tng.printState (new File(stateFile.value));
      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        tng.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

      if (outputModelFilename.value != null) {
        assert (tng != null);
        try {
          ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (tng);
          oos.close();
        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }
     
    }
    else if (languageInputFiles.value != null) {
      // Start a new polylingual topic model
     
      PolylingualTopicModel topicModel = null;

      int numLanguages = languageInputFiles.value.length;

      InstanceList[] training = new InstanceList[ languageInputFiles.value.length ];
      for (int i=0; i < training.length; i++) {
        training[i] = InstanceList.load(new File(languageInputFiles.value[i]));
        if (training[i] != null) { System.out.println(i + " is not null"); }
        else { System.out.println(i + " is null"); }
      }

      System.out.println ("Data loaded.");
     
      // For historical reasons we currently only support FeatureSequence data,
      //  not the FeatureVector, which is the default for the input functions.
      //  Provide a warning to avoid ClassCastExceptions.
      if (training[0].size() > 0 &&
        training[0].get(0) != null) {
        Object data = training[0].get(0).getData();
        if (! (data instanceof FeatureSequence)) {
          System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
          System.exit(1);
        }
      }
     
      topicModel = new PolylingualTopicModel (numTopics.value, alpha.value);
      if (randomSeed.value != 0) {
        topicModel.setRandomSeed(randomSeed.value);
      }
     
      topicModel.addInstances(training);

      topicModel.setTopicDisplay(showTopicsInterval.value, topWords.value);

      topicModel.setNumIterations(numIterations.value);
      topicModel.setOptimizeInterval(optimizeInterval.value);
      topicModel.setBurninPeriod(optimizeBurnIn.value);

      if (outputStateInterval.value != 0) {
        topicModel.setSaveState(outputStateInterval.value, stateFile.value);
      }

      if (outputModelInterval.value != 0) {
        topicModel.setModelOutput(outputModelInterval.value, outputModelFilename.value);
      }

      topicModel.estimate();

      if (topicKeysFile.value != null) {
        topicModel.printTopWords(new File(topicKeysFile.value), topWords.value, false);
      }

      if (stateFile.value != null) {
        topicModel.printState (new File(stateFile.value));
      }

      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        topicModel.printDocumentTopics(out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

      if (inferencerFilename.value != null) {
        try {
          for (int language = 0; language < numLanguages; language++) {

            ObjectOutputStream oos =
              new ObjectOutputStream(new FileOutputStream(inferencerFilename.value + "." + language));
            oos.writeObject(topicModel.getInferencer(language));
            oos.close();
          }

        } catch (Exception e) {
          System.err.println(e.getMessage());
        }

      }

      if (outputModelFilename.value != null) {
        assert (topicModel != null);
        try {

          ObjectOutputStream oos =
            new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (topicModel);
          oos.close();

        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }

    }
    else {

      // Start a new LDA topic model
     
      ParallelTopicModel topicModel = null;

      if (inputModelFilename.value != null) {
       
        try {
          topicModel = ParallelTopicModel.read(new File(inputModelFilename.value));
        } catch (Exception e) {
          System.err.println("Unable to restore saved topic model " +
                     inputModelFilename.value + ": " + e);
          System.exit(1);
        }
        /*
        // Loading new data is optional if we are restoring a saved state.
        if (inputFile.value != null) {
          InstanceList instances = InstanceList.load (new File(inputFile.value));
          System.out.println ("Data loaded.");
          lda.addInstances(instances);
        }
        */
      }
      else {
        InstanceList training = null;
        try {
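          // A "db:" prefix routes loading through DBInstanceIterator; anything else is a serialized InstanceList file.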
          if (inputFile.value.startsWith("db:")) {
            training = DBInstanceIterator.getInstances(inputFile.value.substring(3));
          }
          else {
            training = InstanceList.load (new File(inputFile.value));
          }
        } catch (Exception e) {
          System.err.println("Unable to restore instance list " +
                     inputFile.value + ": " + e);
          System.exit(1);         
        }

        System.out.println ("Data loaded.");

        if (training.size() > 0 &&
          training.get(0) != null) {
          Object data = training.get(0).getData();
          if (! (data instanceof FeatureSequence)) {
            System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
            System.exit(1);
          }
        }
View Full Code Here

    if (inputFile.value() == null) {
      System.err.println("Input instance list is required, use --input option");
      System.exit(1);
    }

    InstanceList instances = InstanceList.load(new File(inputFile.value()));
    InstanceList testing = null;
    if (testingFile.value() != null) {
      testing = InstanceList.load(new File(testingFile.value()));
    }
 
    HierarchicalLDA hlda = new HierarchicalLDA();
View Full Code Here
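
The excerpt stops just after the sampler is constructed. In the tool it comes from, the continuation is roughly as follows; a hedged sketch in which the option accessors (numLevels, randomSeed, numIterations) and the method signatures are assumptions to check against your MALLET version:

    // Sketch only: seed the RNG, then initialize and run the nested-CRP sampler.
    Randoms random = (randomSeed.value() == 0) ? new Randoms()
                                               : new Randoms(randomSeed.value());
    hlda.initialize(instances, testing, numLevels.value(), random); // numLevels = depth of the topic tree
    hlda.estimate(numIterations.value());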

    if (randomSeed.value != 0) {
      topicModel.setRandomSeed(randomSeed.value);
    }

    if (inputFile.value != null) {
      InstanceList training = null;
      try {
        if (inputFile.value.startsWith("db:")) {
          training = DBInstanceIterator.getInstances(inputFile.value.substring(3));
        }
        else {
          training = InstanceList.load (new File(inputFile.value));
        }
      } catch (Exception e) {
        logger.warning("Unable to restore instance list " +
                   inputFile.value + ": " + e);
        System.exit(1);
      }

      logger.info("Data loaded.");
     
      if (training.size() > 0 &&
        training.get(0) != null) {
        Object data = training.get(0).getData();
        if (! (data instanceof FeatureSequence)) {
          logger.warning("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
          System.exit(1);
        }
      }
View Full Code Here

    {
      // Print out some feature information
      logger.info ("Feature induction iteration "+featureInductionIteration);

      // Train the CRF
      InstanceList theTrainingData = trainingData;
      if (trainingProportions != null && featureInductionIteration < trainingProportions.length) {
        logger.info ("Training on "+trainingProportions[featureInductionIteration]+"% of the data this round.");
        InstanceList[] sampledTrainingData = trainingData.split (new Random(1),
            new double[] {trainingProportions[featureInductionIteration],
          1-trainingProportions[featureInductionIteration]});
        theTrainingData = sampledTrainingData[0];
        theTrainingData.setFeatureSelection (crf.globalFeatureSelection); // xxx necessary?
        logger.info ("  which is "+theTrainingData.size()+" instances");
      }
      boolean converged = false;
      if (featureInductionIteration != 0)
        // Don't train until we have added some features
        converged = this.train (theTrainingData, numIterationsBetweenFeatureInductions);
      trainingIteration += numIterationsBetweenFeatureInductions;

      logger.info ("Starting feature induction with "+crf.inputAlphabet.size()+" features.");

      // Create the list of error tokens, for both unclustered and clustered feature induction
      InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
          trainingData.getTargetAlphabet());
      // This errorInstances.featureSelection will get examined by FeatureInducer,
      // so it can know how to add "new" singleton features
      errorInstances.setFeatureSelection (crf.globalFeatureSelection);
      ArrayList errorLabelVectors = new ArrayList();
      InstanceList[][] clusteredErrorInstances = new InstanceList[numLabels][numLabels];
      ArrayList[][] clusteredErrorLabelVectors = new ArrayList[numLabels][numLabels];

      for (int i = 0; i < numLabels; i++)
        for (int j = 0; j < numLabels; j++) {
          clusteredErrorInstances[i][j] = new InstanceList (trainingData.getDataAlphabet(),
              trainingData.getTargetAlphabet());
          clusteredErrorInstances[i][j].setFeatureSelection (crf.globalFeatureSelection);
          clusteredErrorLabelVectors[i][j] = new ArrayList();
        }
View Full Code Here

      constrainedInstances.or(constraint.preProcess(train));
      constraint.setStateLabelMap(stateLabelMap);
    }
   
    // Keep only the instances touched by at least one constraint; count the rest as removed.
    int removed = 0;
    InstanceList tempTrain = train.cloneEmpty();
    for (int ii = 0; ii < train.size(); ii++) {
      if (constrainedInstances.get(ii)) {
        tempTrain.add(train.get(ii));
      }
      else {
        removed++;
      }
    }
View Full Code Here

    // Build synthetic state names for a fully connected MEMM, then sanity-check its optimizable wrapper.
    String[] stateNames = new String[numStates];
    for (int i = 0; i < numStates; i++)
      stateNames[i] = "state" + i;
    memm.addFullyConnectedStates(stateNames);
    MEMMTrainer memmt = new MEMMTrainer (memm);
    MEMMTrainer.MEMMOptimizableByLabelLikelihood omemm = memmt.getOptimizableMEMM (new InstanceList(null));
    TestOptimizable.testGetSetParameters(omemm);
  }
View Full Code Here

  }

  public void testSpaceMaximizable ()
  {
    Pipe p = makeSpacePredictionPipe ();
    InstanceList training = new InstanceList (p);
//    String[] data = { TestMEMM.data[0], }; // TestMEMM.data[1], TestMEMM.data[2], TestMEMM.data[3], };
//    String[] data = { "ab" };
    training.addThruPipe (new ArrayIterator (data));

//    CRF4 memm = new CRF4 (p, null);
    MEMM memm = new MEMM (p, null);
    memm.addFullyConnectedStatesForLabels ();
    memm.addStartState();
View Full Code Here

  }

  public void testSpaceSerializable () throws IOException, ClassNotFoundException
  {
    Pipe p = makeSpacePredictionPipe ();
    InstanceList training = new InstanceList (p);
    training.addThruPipe (new ArrayIterator (data));

    MEMM memm = new MEMM (p, null);
    memm.addFullyConnectedStatesForLabels ();
    memm.addStartState();
    memm.setWeightsDimensionAsIn(training);
View Full Code Here
