Package cc.mallet.types

Examples of cc.mallet.types.InstanceList
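
As orientation before the excerpts: an InstanceList is bound to a Pipe that converts raw input into feature data, and instances are fed through that pipe. Below is a minimal, self-contained sketch of that pattern; the pipe chain and the toy documents are illustrative assumptions, not taken from any of the excerpts.

import java.util.ArrayList;
import java.util.Arrays;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class InstanceListSketch {
  public static void main(String[] args) {
    // Pipe chain: map the target to a Label, tokenize the text, and encode
    // tokens as a FeatureSequence (the representation the topic-model tools below expect).
    Pipe pipe = new SerialPipes(new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence()
    });

    InstanceList instances = new InstanceList(pipe);

    // Each Instance carries (data, target, name, source).
    ArrayList<Instance> raw = new ArrayList<Instance>(Arrays.asList(
        new Instance("the quick brown fox", "animals", "doc0", null),
        new Instance("stocks fell sharply today", "finance", "doc1", null)));
    instances.addThruPipe(raw.iterator());

    System.out.println(instances.size() + " instances, "
        + instances.getDataAlphabet().size() + " features");
  }
}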


    if (printOption.value) {
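      // Walk every clustering; within each cluster, print the data object of each member instance.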
      for (int i = 0; i < clusterings.size(); i++) {
        Clustering c = clusterings.get(i);
        for (int j = 0; j < c.getNumClusters(); j++) {
          InstanceList cluster = c.getCluster(j);
          for (int k = 0; k < cluster.size(); k++) {
            System.out.println("clustering " + i + " cluster " + j + " element " + k + " " + cluster.get(k).getData());
          }
          System.out.println();
        }
      }
    }
View Full Code Here


      Alphabet fieldAlph = new Alphabet();
      Alphabet valueAlph = new Alphabet();
      File directory = new File(classDirs.value[i]);
      File[] subdirs = getSubDirs(directory);
      Alphabet clusterAlph = new Alphabet();
      InstanceList instances = new InstanceList(new Noop());
      TIntArrayList labels = new TIntArrayList();
      for (int j = 0; j < subdirs.length; j++) {
        ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
        int label = clusterAlph.lookupIndex(subdirs[j].toString());
        for (int k = 0; k < records.size(); k++) {
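          // 'fi' is a running file counter declared outside this excerpt; it drives the progress marks below.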
          if (fi % 100 == 0) System.out.print(fi);
          else if (fi % 10 == 0) System.out.print(".");
          if (fi % 1000 == 0 && fi > 0) System.out.println();
          System.out.flush();
          fi++;


          File record = records.get(k);
          labels.add(label);
          instances.add(new Instance(new Record(fieldAlph, valueAlph, parseFile(record)),
                        Integer.valueOf(label), record.toString(),
                        record.toString()));
        }
      }
      clusterings[i] =
View Full Code Here

    CommandOption.setSummary (Vectors2Topics.class,
                  "A tool for estimating, saving and printing diagnostics for topic models, such as LDA.");
    CommandOption.process (Vectors2Topics.class, args);

    if (usePAM.value) {
      InstanceList ilist = InstanceList.load (new File(inputFile.value));
      System.out.println ("Data loaded.");
      if (inputModelFilename.value != null)
        throw new IllegalArgumentException ("--input-model not supported with --use-pam.");
      PAM4L pam = new PAM4L(pamNumSupertopics.value, pamNumSubtopics.value);
      pam.estimate (ilist, numIterations.value, /*optimizeModelInterval*/50,
              showTopicsInterval.value,
              outputModelInterval.value, outputModelFilename.value,
              randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
      pam.printTopWords(topWords.value, true);
      if (stateFile.value != null)
        pam.printState (new File(stateFile.value));
      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        pam.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

     
      if (outputModelFilename.value != null) {
        assert (pam != null);
        try {
          ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (pam);
          oos.close();
        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }
     

    }
   
    else if (useNgrams.value) {
      InstanceList ilist = InstanceList.load (new File(inputFile.value));
      System.out.println ("Data loaded.");
      if (inputModelFilename.value != null)
        throw new IllegalArgumentException ("--input-model not supported with --use-ngrams.");
      TopicalNGrams tng = new TopicalNGrams(numTopics.value,
                          alpha.value,
                          beta.value,
                          gamma.value,
                          delta.value,
                          delta1.value,
                          delta2.value);
      tng.estimate (ilist, numIterations.value, showTopicsInterval.value,
              outputModelInterval.value, outputModelFilename.value,
              randomSeed.value == 0 ? new Randoms() : new Randoms(randomSeed.value));
      tng.printTopWords(topWords.value, true);
      if (stateFile.value != null)
        tng.printState (new File(stateFile.value));
      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        tng.printDocumentTopics (out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

      if (outputModelFilename.value != null) {
        assert (tng != null);
        try {
          ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (tng);
          oos.close();
        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }
     
    }
    else if (languageInputFiles.value != null) {
      // Start a new polylingual topic model
     
      PolylingualTopicModel topicModel = null;

      int numLanguages = languageInputFiles.value.length;

      InstanceList[] training = new InstanceList[ languageInputFiles.value.length ];
      for (int i=0; i < training.length; i++) {
        training[i] = InstanceList.load(new File(languageInputFiles.value[i]));
        if (training[i] != null) { System.out.println(i + " is not null"); }
        else { System.out.println(i + " is null"); }
      }

      System.out.println ("Data loaded.");
     
      // For historical reasons we currently only support FeatureSequence data,
      //  not the FeatureVector, which is the default for the input functions.
      //  Provide a warning to avoid ClassCastExceptions.
      if (training[0].size() > 0 &&
        training[0].get(0) != null) {
        Object data = training[0].get(0).getData();
        if (! (data instanceof FeatureSequence)) {
          System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
          System.exit(1);
        }
      }
     
      topicModel = new PolylingualTopicModel (numTopics.value, alpha.value);
      if (randomSeed.value != 0) {
        topicModel.setRandomSeed(randomSeed.value);
      }
     
      topicModel.addInstances(training);

      topicModel.setTopicDisplay(showTopicsInterval.value, topWords.value);

      topicModel.setNumIterations(numIterations.value);
      topicModel.setOptimizeInterval(optimizeInterval.value);
      topicModel.setBurninPeriod(optimizeBurnIn.value);

      if (outputStateInterval.value != 0) {
        topicModel.setSaveState(outputStateInterval.value, stateFile.value);
      }

      if (outputModelInterval.value != 0) {
        topicModel.setModelOutput(outputModelInterval.value, outputModelFilename.value);
      }

      topicModel.estimate();

      if (topicKeysFile.value != null) {
        topicModel.printTopWords(new File(topicKeysFile.value), topWords.value, false);
      }

      if (stateFile.value != null) {
        topicModel.printState (new File(stateFile.value));
      }

      if (docTopicsFile.value != null) {
        PrintWriter out = new PrintWriter (new FileWriter (new File(docTopicsFile.value)));
        topicModel.printDocumentTopics(out, docTopicsThreshold.value, docTopicsMax.value);
        out.close();
      }

      if (inferencerFilename.value != null) {
        try {
          for (int language = 0; language < numLanguages; language++) {

            ObjectOutputStream oos =
              new ObjectOutputStream(new FileOutputStream(inferencerFilename.value + "." + language));
            oos.writeObject(topicModel.getInferencer(language));
            oos.close();
          }

        } catch (Exception e) {
          System.err.println(e.getMessage());
        }

      }

      if (outputModelFilename.value != null) {
        assert (topicModel != null);
        try {

          ObjectOutputStream oos =
            new ObjectOutputStream (new FileOutputStream (outputModelFilename.value));
          oos.writeObject (topicModel);
          oos.close();

        } catch (Exception e) {
          e.printStackTrace();
          throw new IllegalArgumentException ("Couldn't write topic model to filename "+outputModelFilename.value);
        }
      }

    }
    else {

      // Start a new LDA topic model
     
      ParallelTopicModel topicModel = null;

      if (inputModelFilename.value != null) {
       
        try {
          topicModel = ParallelTopicModel.read(new File(inputModelFilename.value));
        } catch (Exception e) {
          System.err.println("Unable to restore saved topic model " +
                     inputModelFilename.value + ": " + e);
          System.exit(1);
        }
        /*
        // Loading new data is optional if we are restoring a saved state.
        if (inputFile.value != null) {
          InstanceList instances = InstanceList.load (new File(inputFile.value));
          System.out.println ("Data loaded.");
          lda.addInstances(instances);
        }
        */
      }
      else {
        InstanceList training = null;
        try {
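          // A "db:" prefix routes loading through DBInstanceIterator; anything else is a serialized InstanceList file.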
          if (inputFile.value.startsWith("db:")) {
            training = DBInstanceIterator.getInstances(inputFile.value.substring(3));
          }
          else {
            training = InstanceList.load (new File(inputFile.value));
          }
        } catch (Exception e) {
          System.err.println("Unable to restore instance list " +
                     inputFile.value + ": " + e);
          System.exit(1);         
        }

        System.out.println ("Data loaded.");

        if (training.size() > 0 &&
          training.get(0) != null) {
          Object data = training.get(0).getData();
          if (! (data instanceof FeatureSequence)) {
            System.err.println("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
            System.exit(1);
          }
        }
View Full Code Here

    if (inputFile.value() == null) {
      System.err.println("Input instance list is required, use --input option");
      System.exit(1);
    }

    InstanceList instances = InstanceList.load(new File(inputFile.value()));
    InstanceList testing = null;
    if (testingFile.value() != null) {
      testing = InstanceList.load(new File(testingFile.value()));
    }
 
    HierarchicalLDA hlda = new HierarchicalLDA();
View Full Code Here
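
The excerpt stops just after the sampler is constructed. In the tool it comes from, the continuation is roughly as follows; a hedged sketch in which the option accessors (numLevels, randomSeed, numIterations) and the method signatures are assumptions to check against your MALLET version:

    // Sketch only: seed the RNG, then initialize and run the nested-CRP sampler.
    Randoms random = (randomSeed.value() == 0) ? new Randoms()
                                               : new Randoms(randomSeed.value());
    hlda.initialize(instances, testing, numLevels.value(), random); // numLevels = depth of the topic tree
    hlda.estimate(numIterations.value());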

    if (randomSeed.value != 0) {
      topicModel.setRandomSeed(randomSeed.value);
    }

    if (inputFile.value != null) {
      InstanceList training = null;
      try {
        if (inputFile.value.startsWith("db:")) {
          training = DBInstanceIterator.getInstances(inputFile.value.substring(3));
        }
        else {
          training = InstanceList.load (new File(inputFile.value));
        }
      } catch (Exception e) {
        logger.warning("Unable to restore instance list " +
                   inputFile.value + ": " + e);
        System.exit(1);
      }

      logger.info("Data loaded.");
     
      if (training.size() > 0 &&
        training.get(0) != null) {
        Object data = training.get(0).getData();
        if (! (data instanceof FeatureSequence)) {
          logger.warning("Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.");
          System.exit(1);
        }
      }
View Full Code Here

    {
      // Print out some feature information
      logger.info ("Feature induction iteration "+featureInductionIteration);

      // Train the CRF
      InstanceList theTrainingData = trainingData;
      if (trainingProportions != null && featureInductionIteration < trainingProportions.length) {
        logger.info ("Training on "+trainingProportions[featureInductionIteration]+"% of the data this round.");
        InstanceList[] sampledTrainingData = trainingData.split (new Random(1),
            new double[] {trainingProportions[featureInductionIteration],
          1-trainingProportions[featureInductionIteration]});
        theTrainingData = sampledTrainingData[0];
        theTrainingData.setFeatureSelection (crf.globalFeatureSelection); // xxx necessary?
        logger.info ("  which is "+theTrainingData.size()+" instances");
      }
      boolean converged = false;
      if (featureInductionIteration != 0)
        // Don't train until we have added some features
        converged = this.train (theTrainingData, numIterationsBetweenFeatureInductions);
      trainingIteration += numIterationsBetweenFeatureInductions;

      logger.info ("Starting feature induction with "+crf.inputAlphabet.size()+" features.");

      // Create the list of error tokens, for both unclustered and clustered feature induction
      InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
          trainingData.getTargetAlphabet());
      // This errorInstances.featureSelection will get examined by FeatureInducer,
      // so it can know how to add "new" singleton features
      errorInstances.setFeatureSelection (crf.globalFeatureSelection);
      ArrayList errorLabelVectors = new ArrayList();
      InstanceList[][] clusteredErrorInstances = new InstanceList[numLabels][numLabels];
      ArrayList[][] clusteredErrorLabelVectors = new ArrayList[numLabels][numLabels];

      for (int i = 0; i < numLabels; i++)
        for (int j = 0; j < numLabels; j++) {
          clusteredErrorInstances[i][j] = new InstanceList (trainingData.getDataAlphabet(),
              trainingData.getTargetAlphabet());
          clusteredErrorInstances[i][j].setFeatureSelection (crf.globalFeatureSelection);
          clusteredErrorLabelVectors[i][j] = new ArrayList();
        }
View Full Code Here

      constrainedInstances.or(constraint.preProcess(train));
      constraint.setStateLabelMap(stateLabelMap);
    }
   
    // Keep only the instances touched by at least one constraint; count the rest as removed.
    int removed = 0;
    InstanceList tempTrain = train.cloneEmpty();
    for (int ii = 0; ii < train.size(); ii++) {
      if (constrainedInstances.get(ii)) {
        tempTrain.add(train.get(ii));
      }
      else {
        removed++;
      }
    }
View Full Code Here

    // Build synthetic state names for a fully connected MEMM, then sanity-check its optimizable wrapper.
    String[] stateNames = new String[numStates];
    for (int i = 0; i < numStates; i++)
      stateNames[i] = "state" + i;
    memm.addFullyConnectedStates(stateNames);
    MEMMTrainer memmt = new MEMMTrainer (memm);
    MEMMTrainer.MEMMOptimizableByLabelLikelihood omemm = memmt.getOptimizableMEMM (new InstanceList(null));
    TestOptimizable.testGetSetParameters(omemm);
  }
View Full Code Here

  }

  public void testSpaceMaximizable ()
  {
    Pipe p = makeSpacePredictionPipe ();
    InstanceList training = new InstanceList (p);
//    String[] data = { TestMEMM.data[0], }; // TestMEMM.data[1], TestMEMM.data[2], TestMEMM.data[3], };
//    String[] data = { "ab" };
    training.addThruPipe (new ArrayIterator (data));

//    CRF4 memm = new CRF4 (p, null);
    MEMM memm = new MEMM (p, null);
    memm.addFullyConnectedStatesForLabels ();
    memm.addStartState();
View Full Code Here

  }

  public void testSpaceSerializable () throws IOException, ClassNotFoundException
  {
    Pipe p = makeSpacePredictionPipe ();
    InstanceList training = new InstanceList (p);
    training.addThruPipe (new ArrayIterator (data));

    MEMM memm = new MEMM (p, null);
    memm.addFullyConnectedStatesForLabels ();
    memm.addStartState();
    memm.setWeightsDimensionAsIn(training);
View Full Code Here
