Examples of TermInfo


Examples of org.apache.mahout.utils.vectors.TermInfo

  @Test
  public void testIterable() throws Exception {
    IndexReader reader = DirectoryReader.open(directory);
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);

    //TODO: do something more meaningful here
    for (Vector vector : iterable) {
      assertNotNull(vector);
    }
  }
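
The test above relies on a createTestIndex() helper that is not shown in the excerpt. The following is a self-contained sketch of the same pattern, assuming the Mahout 0.9 / Lucene 4.x APIs used in these snippets; the field names, document text, and index setup are illustrative and not taken from the original test.

// Assumed imports: org.apache.lucene.analysis.standard.*, org.apache.lucene.document.*,
// org.apache.lucene.index.*, org.apache.lucene.store.*, org.apache.lucene.util.Version,
// org.apache.mahout.math.Vector, org.apache.mahout.utils.vectors.TermInfo,
// org.apache.mahout.utils.vectors.lucene.*, org.apache.mahout.vectorizer.*

// Build a tiny in-memory index; the "content" field stores term vectors, which
// LuceneIterable needs in order to compute per-document vectors.
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory,
    new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)));
FieldType contentType = new FieldType(TextField.TYPE_STORED);
contentType.setStoreTermVectors(true);
Document doc = new Document();
doc.add(new StringField("id", "doc-1", Field.Store.YES));
doc.add(new Field("content", "the quick brown fox jumps over the lazy dog", contentType));
writer.addDocument(doc);
writer.close();

// Cache term statistics for "content" (min document frequency 1, max DF 100 percent)
// and iterate TF-IDF-weighted vectors keyed by the "id" field.
IndexReader reader = DirectoryReader.open(directory);
Weight weight = new TFIDF();
TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
for (Vector vector : new LuceneIterable(reader, "id", "content", termInfo, weight)) {
  System.out.println(vector.asFormatString());
}
reader.close();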

Examples of org.apache.mahout.utils.vectors.TermInfo

    RAMDirectory directory = createTestIndex(Field.TermVector.NO);
    IndexReader reader = DirectoryReader.open(directory);
   
   
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);

    Iterator<Vector> iterator = iterable.iterator();
    iterator.hasNext();
    iterator.next();

Examples of org.apache.mahout.utils.vectors.TermInfo

    //get real vectors
    createTestIndex(Field.TermVector.NO, directory, false, 5);
    IndexReader reader = DirectoryReader.open(directory);

    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
   
    boolean exceptionThrown;
    //0 percent tolerance
    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
    try {
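      // The excerpt is cut off here. A hedged guess at how the test continues, based on
      // the exceptionThrown flag declared above: with zero tolerance for error documents,
      // iterating an index whose documents lack term vectors is expected to fail part-way
      // through. The exact exception type (IllegalStateException) is an assumption, not
      // confirmed by the excerpt.
      exceptionThrown = false;
      for (Vector vector : iterable) {
        assertNotNull(vector);
      }
    } catch (IllegalStateException e) {
      exceptionThrown = true;
    }
    assertTrue(exceptionThrown);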

Examples of org.apache.mahout.utils.vectors.TermInfo

      weight = new TFIDF();
    } else {
      throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
    }

    TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
   
    LuceneIterable iterable;
    if (norm == LuceneIterable.NO_NORMALIZING) {
      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, LuceneIterable.NO_NORMALIZING,
          maxPercentErrorDocs);
    } else {
      iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm, maxPercentErrorDocs);
    }

    log.info("Output File: {}", outFile);

    VectorWriter vectorWriter = getSeqFileWriter(outFile);
    try {
      long numDocs = vectorWriter.write(iterable, maxDocs);
      log.info("Wrote: {} vectors", numDocs);
    } finally {
      Closeables.close(vectorWriter, false);
    }

    File dictOutFile = new File(dictOut);
    log.info("Dictionary Output file: {}", dictOutFile);
    Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
    DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field);
    try {
      tiWriter.write(termInfo);
    } finally {
      Closeables.close(tiWriter, false);
    }

    if (!"".equals(seqDictOut)) {
      log.info("SequenceFile Dictionary Output file: {}", seqDictOut);

      Path path = new Path(seqDictOut);
      Configuration conf = new Configuration();
      FileSystem fs = FileSystem.get(conf);
      SequenceFile.Writer seqWriter = null;
      try {
        seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class);
        Text term = new Text();
        IntWritable termIndex = new IntWritable();

        Iterator<TermEntry> termEntries = termInfo.getAllEntries();
        while (termEntries.hasNext()) {
          TermEntry termEntry = termEntries.next();
          term.set(termEntry.getTerm());
          termIndex.set(termEntry.getTermIdx());
          seqWriter.append(term, termIndex);
        }
      } finally {
        Closeables.close(seqWriter, false);
      }
    }
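
To verify what the driver wrote, the SequenceFile dictionary can be read back with the same Text/IntWritable key-value types. This is a separate sketch, not part of the original driver; it assumes seqDictOut still names the file written above.

Path dictPath = new Path(seqDictOut);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
SequenceFile.Reader dictReader = new SequenceFile.Reader(fs, dictPath, conf);
try {
  Text term = new Text();
  IntWritable termIndex = new IntWritable();
  while (dictReader.next(term, termIndex)) {
    System.out.println(termIndex.get() + "\t" + term);
  }
} finally {
  Closeables.close(dictReader, false);
}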

Examples of org.apache.mahout.utils.vectors.TermInfo

   
    IndexReader reader = DirectoryReader.open(directory);
  

    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
   
    int numTerms = 0;
    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
      it.next();
      numTerms++;
    }
    termDictionary = new String[numTerms];
    int i = 0;
    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
      String term = it.next().getTerm();
      termDictionary[i] = term;
      System.out.println(i + " " + term);
      i++;
    }
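
The two passes over getAllEntries() above (one to count terms, one to fill the array) can be collapsed into a single pass by collecting into a list first. A small sketch of that alternative, assuming java.util imports:

List<String> terms = new ArrayList<String>();
for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
  terms.add(it.next().getTerm());
}
termDictionary = terms.toArray(new String[terms.size()]);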

Examples of org.apache.mahout.utils.vectors.TermInfo

    IndexReader reader = DirectoryReader.open(directory);
    System.out.println("Number of documents: \t"+reader.numDocs());
   
   
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
   
    int i = 0;
    for (Vector vector : iterable) {
      assertNotNull(vector);

Examples of org.apache.mahout.utils.vectors.TermInfo

  }

  public void testIterable() throws Exception {
    IndexReader reader = IndexReader.open(directory, true);
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);

    //TODO: do something more meaningful here
    for (Vector vector : iterable) {
      assertNotNull(vector);
    }
  }

Examples of org.apache.mahout.utils.vectors.TermInfo

          }
          int maxDFPercent = 99;
          if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
          }
          TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
          VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
          double norm = LuceneIterable.NO_NORMALIZING;
          if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if (power.equals("INF")) {

Examples of org.apache.mahout.utils.vectors.TermInfo

        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
          maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }

        TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
        VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);

        double norm = LuceneIterable.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
          String power = cmdLine.getValue(powerOpt).toString();

Examples of org.apache.mahout.utils.vectors.TermInfo

      writer.addDocument(doc);
    }
    writer.close();
    IndexReader reader = IndexReader.open(directory, true);
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", mapper);

    int i = 0;
    for (Vector vector : iterable) {