Package com.digitalpebble.classification

Examples of com.digitalpebble.classification.Document
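
The snippets below all revolve around the same lifecycle: a Document is created from raw tokens or from named Fields by a Learner (at training time, with a label) or by a TextClassifier (at classification time), gathered into a training corpus, and converted into a sparse feature Vector against the Lexicon maintained by the Learner. The sketch below condenses that flow. It is a sketch only: it assumes an already-configured Learner instance (none of the snippets show how one is obtained) and assumes that Field, Learner, Parameters, RAMTrainingCorpus and Vector live in the same com.digitalpebble.classification package as Document.

    import com.digitalpebble.classification.Document;
    import com.digitalpebble.classification.Field;
    import com.digitalpebble.classification.Learner;
    import com.digitalpebble.classification.Parameters;
    import com.digitalpebble.classification.RAMTrainingCorpus;
    import com.digitalpebble.classification.Vector;

    public class DocumentLifecycleSketch {

        // 'learner' is assumed to be an already-configured Learner;
        // its construction is not shown anywhere on this page
        static void sketch(Learner learner) throws Exception {
            // build a labelled Document from named fields
            Field[] fields = new Field[2];
            fields[0] = new Field("title", new String[] { "a", "title" });
            fields[1] = new Field("content", new String[] { "some", "content" });
            learner.setMethod(Parameters.WeightingMethod.TFIDF);
            Document doc = learner.createDocument(fields, "someLabel");

            // collect documents in an in-memory corpus and train
            RAMTrainingCorpus corpus = new RAMTrainingCorpus();
            corpus.add(doc);
            learner.learn(corpus);

            // project the document onto the lexicon as a sparse vector
            Vector vector = doc.getFeatureVector(learner.getLexicon());
            int[] indices = vector.getIndices();
            double[] values = vector.getValues();
        }
    }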


        // get an iterator on the Corpus
        // and retrieve the documents one by one
        Iterator<Document> docIterator = corpus.iterator();
        while (docIterator.hasNext()) {
            Document doc = docIterator.next();
            int label = doc.getLabel();
            // get a feature vector from the document:
            // this needs a weighting metric (e.g. relative frequency / binary)
            // and a lexicon; an optional attribute mapping can remap the indices
            // the vector is then serialized to a string below
            Vector vector = null;
            if (attributeMapping == null)
                vector = doc.getFeatureVector(lexicon);
            else
                vector = doc.getFeatureVector(lexicon, attributeMapping);

            StringBuffer buffer = new StringBuffer();

            buffer.append(lexicon.getLabel(label));
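
The StringBuffer at the end of the snippet is presumably building one text line per document, starting with the label name returned by lexicon.getLabel(label) and followed by the vector's index:value pairs (an SVM-style training file). Below is a hedged sketch of that serialization, using only the Vector accessors (getIndices()/getValues()) that appear elsewhere on this page; the exact output format of the original code is an assumption.

    // sketch only: write a document as "<label> index:value index:value ..."
    static String toLine(String labelName, Vector vector) {
        StringBuilder buffer = new StringBuilder();
        buffer.append(labelName);
        int[] indices = vector.getIndices();
        double[] values = vector.getValues();
        for (int i = 0; i < indices.length; i++) {
            buffer.append(' ').append(indices[i]).append(':').append(values[i]);
        }
        return buffer.toString();
    }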


      // lower-case the tokens
      for (int i = 0; i < tokens.length; i++) {
        tokens[i] = tokens[i].toLowerCase();
      }

      // a Learner attaches the label at creation time,
      // a TextClassifier creates an unlabelled document
      Document doc = null;
      if (operator instanceof Learner)
        doc = ((Learner) operator).createDocument(tokens, label);
      else
        doc = ((TextClassifier) operator).createDocument(tokens);
      corpusList.add(doc);
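
The instanceof branch above exists because only a Learner attaches a label when the Document is created; a TextClassifier builds an unlabelled Document for later classification. A minimal sketch of the two calls, assuming learner and classifier instances are already available:

    // sketch only: the same tokens go through either operator,
    // but only the Learner records a label on the Document
    static void createBoth(Learner learner, TextClassifier classifier) {
        String[] tokens = new String[] { "some", "lower", "cased", "tokens" };
        Document labelled = learner.createDocument(tokens, "someLabel");
        Document unlabelled = classifier.createDocument(tokens);
    }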

    Field[] fields = new Field[3];
    fields[0] = new Field("title", new String[] { "This", "is", "a",
        "title" });
    fields[1] = new Field("abstract", new String[] { "abstract" });
    fields[2] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "large", "value" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc = learner.createDocument(fields, "large");

    Field[] fields2 = new Field[2];
    fields2[0] = new Field("title", new String[] { "This", "is", "not",
        "a", "title" });
    // fields2[1] = new Field("abstract", new String[]{"abstract"});
    fields2[1] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "small", "value" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc2 = learner.createDocument(fields2, "small");

    // try putting the same field several times
    Field[] fields3 = new Field[3];
    fields3[0] = new Field("title", new String[] { "This", "is", "not",
        "a", "title" });
    // fields2[1] = new Field("abstract", new String[]{"abstract"});
    fields3[1] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "small", "value" });
    fields3[2] = new Field("title", new String[] { "some", "different",
        "content" });
    learner.setMethod(Parameters.WeightingMethod.TFIDF);
    Document doc3 = learner.createDocument(fields3, "small");

    RAMTrainingCorpus corpus = new RAMTrainingCorpus();
    corpus.add(doc);
    corpus.add(doc2);
    learner.learn(corpus);
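
The snippet selects Parameters.WeightingMethod.TFIDF before each createDocument call; FREQUENCY and BOOLEAN appear elsewhere on this page as alternatives. A small hedged sketch of building a corpus under a chosen weighting method; the helper and its name are illustrative, not part of the library:

    // illustrative helper: create labelled documents from Field arrays
    // under a given weighting method and collect them into a corpus
    static RAMTrainingCorpus buildCorpus(Learner learner,
            Parameters.WeightingMethod method,
            Field[][] fieldSets, String[] labels) {
        learner.setMethod(method); // e.g. TFIDF, FREQUENCY or BOOLEAN
        RAMTrainingCorpus corpus = new RAMTrainingCorpus();
        for (int i = 0; i < fieldSets.length; i++) {
            corpus.add(learner.createDocument(fieldSets[i], labels[i]));
        }
        return corpus;
    }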

  public void testCustomWeightingScheme() throws Exception {
    Field[] fields = new Field[1];
    fields[0] = new Field("keywords", new String[] { "test","keywords"});
    learner.setMethod(Parameters.WeightingMethod.FREQUENCY);
    learner.getLexicon().setMethod(Parameters.WeightingMethod.BOOLEAN, "keywords");
    Document doc = learner.createDocument(fields, "large");
    Vector vector = doc.getFeatureVector(learner.getLexicon());
   
    // check that the values for the field keywords are boolean
    int[] indices = vector.getIndices();
    double[] values = vector.getValues();
   
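
The test is cut off before its assertions. A plausible continuation, using only the accessors already obtained above, is to walk the parallel indices/values arrays and check that every weight is 0 or 1, i.e. that the per-field BOOLEAN override on "keywords" took effect despite the document-level FREQUENCY method:

    // sketch of the missing assertions: with a BOOLEAN override on the
    // "keywords" field, every value in this single-field vector is 0 or 1
    for (int i = 0; i < indices.length; i++) {
        assertTrue(values[i] == 0.0 || values[i] == 1.0);
    }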

    Field[] fields = new Field[3];
    fields[0] = new Field("title", new String[] { "This", "is", "a",
        "title" });
    fields[1] = new Field("abstract", new String[] { "abstract" });
    fields[2] = new Field("content", new String[] { "This", "is", "the",
        "content", "this", "will", "have", "a", "large", "value" });
    Document doc = learner.createDocument(fields, "large");

    String[] simplecontent = new String[] { "This", "is", "the", "content",
        "this", "will", "have", "a", "small", "value" };
    Document doc2 = learner.createDocument(simplecontent, "small");

    // com.digitalpebble.classification.TrainingCorpus tc =
    // learner.getFileTrainingCorpus();
    com.digitalpebble.classification.TrainingCorpus tc = new RAMTrainingCorpus();
    tc.addDocument(doc);
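
The natural continuation, mirroring the RAMTrainingCorpus snippet further up, is to add the second document and hand the corpus to the learner; whether learn(...) accepts the TrainingCorpus interface (rather than a RAMTrainingCorpus specifically) is an assumption here:

    // hedged continuation: register the remaining document and train
    tc.addDocument(doc2);
    learner.learn(tc);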

    RAMTrainingCorpus corpus = new RAMTrainingCorpus();

    learner.setMethod(method);

    for (String[] content : docs) {
      Document doc = learner.createDocument(content);
      corpus.add(doc);
    }

    Iterator<Document> corpusIter = corpus.iterator();

    Map<Integer, String> invertedIndex = learner.getLexicon()
        .getInvertedIndex();

    List<Map> expectedset = references.get(method);

    // check that we have the same number of docs in the corpus
    // and in the ref

    assertEquals(expectedset.size(), corpus.size());

    for (Map<String, Double> ref : expectedset) {
      Document doc = corpusIter.next();
      Vector vector = doc.getFeatureVector(learner.getLexicon());
      // now let's compare what we wanted to have with the content of the
      // vector
      int[] indices = vector.getIndices();
      double[] values = vector.getValues();
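
The comparison loop is truncated here. A hedged sketch of how the check could proceed: resolve each feature index back to its term through the lexicon's inverted index, then compare the stored weight with the expected value in the reference map for this document:

    // sketch of the comparison against the reference map (JUnit asserts,
    // as used elsewhere in these tests); the tolerance is an assumption
    for (int i = 0; i < indices.length; i++) {
        String term = invertedIndex.get(indices[i]);
        Double expected = ref.get(term);
        assertNotNull("unexpected term: " + term, expected);
        assertEquals(expected.doubleValue(), values[i], 1e-6);
    }
    assertEquals(ref.size(), indices.length);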
