Package com.digitalpebble.classification

Examples of com.digitalpebble.classification.Document
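
The snippets below all revolve around the same lifecycle: a Document is created from raw tokens or from named Fields by a Learner (at training time, with a label) or by a TextClassifier (at classification time), gathered into a training corpus, and converted into a sparse feature Vector against the Lexicon maintained by the Learner. The sketch below condenses that flow. It is a sketch only: it assumes an already-configured Learner instance (none of the snippets show how one is obtained) and assumes that Field, Learner, Parameters, RAMTrainingCorpus and Vector live in the same com.digitalpebble.classification package as Document.

    import com.digitalpebble.classification.Document;
    import com.digitalpebble.classification.Field;
    import com.digitalpebble.classification.Learner;
    import com.digitalpebble.classification.Parameters;
    import com.digitalpebble.classification.RAMTrainingCorpus;
    import com.digitalpebble.classification.Vector;

    public class DocumentLifecycleSketch {

        // 'learner' is assumed to be an already-configured Learner;
        // its construction is not shown anywhere on this page
        static void sketch(Learner learner) throws Exception {
            // build a labelled Document from named fields
            Field[] fields = new Field[2];
            fields[0] = new Field("title", new String[] { "a", "title" });
            fields[1] = new Field("content", new String[] { "some", "content" });
            learner.setMethod(Parameters.WeightingMethod.TFIDF);
            Document doc = learner.createDocument(fields, "someLabel");

            // collect documents in an in-memory corpus and train
            RAMTrainingCorpus corpus = new RAMTrainingCorpus();
            corpus.add(doc);
            learner.learn(corpus);

            // project the document onto the lexicon as a sparse vector
            Vector vector = doc.getFeatureVector(learner.getLexicon());
            int[] indices = vector.getIndices();
            double[] values = vector.getValues();
        }
    }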


        // get an iterator on the Corpus
        // and retrieve the documents one by one
        Iterator<Document> docIterator = corpus.iterator();
        while (docIterator.hasNext()) {
            Document doc = docIterator.next();
            int label = doc.getLabel();
            // get a feature vector from the document:
            // this needs a weighting metric (e.g. relative frequency / binary)
            // and a lexicon; an optional attribute mapping can remap the indices
            // the vector is then serialized to a string below
            Vector vector = null;
            if (attributeMapping == null)
                vector = doc.getFeatureVector(lexicon);
            else
                vector = doc.getFeatureVector(lexicon, attributeMapping);

            StringBuffer buffer = new StringBuffer();

            buffer.append(lexicon.getLabel(label));
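
The StringBuffer at the end of the snippet is presumably building one text line per document, starting with the label name returned by lexicon.getLabel(label) and followed by the vector's index:value pairs (an SVM-style training file). Below is a hedged sketch of that serialization, using only the Vector accessors (getIndices()/getValues()) that appear elsewhere on this page; the exact output format of the original code is an assumption.

    // sketch only: write a document as "<label> index:value index:value ..."
    static String toLine(String labelName, Vector vector) {
        StringBuilder buffer = new StringBuilder();
        buffer.append(labelName);
        int[] indices = vector.getIndices();
        double[] values = vector.getValues();
        for (int i = 0; i < indices.length; i++) {
            buffer.append(' ').append(indices[i]).append(':').append(values[i]);
        }
        return buffer.toString();
    }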


      // lower-case the tokens
      for (int i = 0; i < tokens.length; i++) {
        tokens[i] = tokens[i].toLowerCase();
      }

      // a Learner attaches the label at creation time,
      // a TextClassifier creates an unlabelled document
      Document doc = null;
      if (operator instanceof Learner)
        doc = ((Learner) operator).createDocument(tokens, label);
      else
        doc = ((TextClassifier) operator).createDocument(tokens);
      corpusList.add(doc);
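
The instanceof branch above exists because only a Learner attaches a label when the Document is created; a TextClassifier builds an unlabelled Document for later classification. A minimal sketch of the two calls, assuming learner and classifier instances are already available:

    // sketch only: the same tokens go through either operator,
    // but only the Learner records a label on the Document
    static void createBoth(Learner learner, TextClassifier classifier) {
        String[] tokens = new String[] { "some", "lower", "cased", "tokens" };
        Document labelled = learner.createDocument(tokens, "someLabel");
        Document unlabelled = classifier.createDocument(tokens);
    }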

    Field[] fields = new Field[3];
    fields[0] = new Field("title", new String[] { "This", "is", "a",
        "title" });
    fields[1] = new Field("abstract", new String[] { "abstract" });
    fields[2] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "large", "value" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc = learner.createDocument(fields, "large");

    Field[] fields2 = new Field[2];
    fields2[0] = new Field("title", new String[] { "This", "is", "not",
        "a", "title" });
    // fields2[1] = new Field("abstract", new String[]{"abstract"});
    fields2[1] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "small", "value" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc2 = learner.createDocument(fields2, "small");

    // try putting the same field several times
    Field[] fields3 = new Field[3];
    fields3[0] = new Field("title", new String[] { "This", "is", "not",
        "a", "title" });
    // fields2[1] = new Field("abstract", new String[]{"abstract"});
    fields3[1] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "small", "value" });
    fields3[2] = new Field("title", new String[] { "some", "different",
        "content" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc3 = learner.createDocument(fields3, "small");

    RAMTrainingCorpus corpus = new RAMTrainingCorpus();
    corpus.add(doc);
    corpus.add(doc2);
    learner.learn(corpus);
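
The snippet selects Parameters.WeightingMethod.TFIDF before each createDocument call; FREQUENCY and BOOLEAN appear elsewhere on this page as alternatives. A small hedged sketch of building a corpus under a chosen weighting method; the helper and its name are illustrative, not part of the library:

    // illustrative helper: create labelled documents from Field arrays
    // under a given weighting method and collect them into a corpus
    static RAMTrainingCorpus buildCorpus(Learner learner,
            Parameters.WeightingMethod method,
            Field[][] fieldSets, String[] labels) {
        learner.setMethod(method); // e.g. TFIDF, FREQUENCY or BOOLEAN
        RAMTrainingCorpus corpus = new RAMTrainingCorpus();
        for (int i = 0; i < fieldSets.length; i++) {
            corpus.add(learner.createDocument(fieldSets[i], labels[i]));
        }
        return corpus;
    }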

  public void testCustomWeightingScheme() throws Exception {
    Field[] fields = new Field[1];
    fields[0] = new Field("keywords", new String[] { "test","keywords"});
    learner.setMethod(Parameters.WeightingMethod.FREQUENCY);
    learner.getLexicon().setMethod(Parameters.WeightingMethod.BOOLEAN, "keywords");
    Document doc = learner.createDocument(fields, "large");
    Vector vector = doc.getFeatureVector(learner.getLexicon());
   
    // check that the values for the field keywords are boolean
    int[] indices = vector.getIndices();
    double[] values = vector.getValues();
   
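
The test is cut off before its assertions. A plausible continuation, using only the accessors already obtained above, is to walk the parallel indices/values arrays and check that every weight is 0 or 1, i.e. that the per-field BOOLEAN override on "keywords" took effect despite the document-level FREQUENCY method:

    // sketch of the missing assertions: with a BOOLEAN override on the
    // "keywords" field, every value in this single-field vector is 0 or 1
    for (int i = 0; i < indices.length; i++) {
        assertTrue(values[i] == 0.0 || values[i] == 1.0);
    }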

    Field[] fields = new Field[3];
    fields[0] = new Field("title", new String[] { "This", "is", "a",
        "title" });
    fields[1] = new Field("abstract", new String[] { "abstract" });
    fields[2] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "large", "value" });
    Document doc = learner.createDocument(fields, "large");

    String[] simplecontent = new String[] { "This", "is", "the", "content",
        "this", "will", "have", "a", "small", "value" };
    Document doc2 = learner.createDocument(simplecontent, "small");

    // com.digitalpebble.classification.TrainingCorpus tc =
    // learner.getFileTrainingCorpus();
    com.digitalpebble.classification.TrainingCorpus tc = new RAMTrainingCorpus();
    tc.addDocument(doc);
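
The natural continuation, mirroring the RAMTrainingCorpus snippet further up, is to add the second document and hand the corpus to the learner; whether learn(...) accepts the TrainingCorpus interface (rather than a RAMTrainingCorpus specifically) is an assumption here:

    // hedged continuation: register the remaining document and train
    tc.addDocument(doc2);
    learner.learn(tc);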

    RAMTrainingCorpus corpus = new RAMTrainingCorpus();

    learner.setMethod(method);

    for (String[] content : docs) {
      Document doc = learner.createDocument(content);
      corpus.add(doc);
    }

    Iterator<Document> corpusIter = corpus.iterator();

    Map<Integer, String> invertedIndex = learner.getLexicon()
        .getInvertedIndex();

    List<Map> expectedset = references.get(method);

    // check that we have the same number of docs in the corpus
    // and in the ref

    assertEquals(expectedset.size(), corpus.size());

    for (Map<String, Double> ref : expectedset) {
      Document doc = corpusIter.next();
      Vector vector = doc.getFeatureVector(learner.getLexicon());
      // now let's compare what we wanted to have with the content of the
      // vector
      int[] indices = vector.getIndices();
      double[] values = vector.getValues();
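
The comparison loop is truncated here. A hedged sketch of how the check could proceed: resolve each feature index back to its term through the lexicon's inverted index, then compare the stored weight with the expected value in the reference map for this document:

    // sketch of the comparison against the reference map (JUnit asserts,
    // as used elsewhere in these tests); the tolerance is an assumption
    for (int i = 0; i < indices.length; i++) {
        String term = invertedIndex.get(indices[i]);
        Double expected = ref.get(term);
        assertNotNull("unexpected term: " + term, expected);
        assertEquals(expected.doubleValue(), values[i], 1e-6);
    }
    assertEquals(ref.size(), indices.length);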
