Package edu.umd.cloud9.util.map

Examples of edu.umd.cloud9.util.map.HMapII
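HMapII is Cloud9's hash map specialized for primitive int keys and int values, and the excerpts below exercise its core operations: put, get, containsKey, size, clear, and iteration over keySet(). As a quick orientation, here is a minimal, self-contained sketch built only from the methods that appear in these excerpts; the term-count scenario is purely illustrative.

import edu.umd.cloud9.util.map.HMapII;

public class HMapIIDemo {
  public static void main(String[] args) {
    // term id -> term count, mirroring the Document examples below
    HMapII counts = new HMapII();
    counts.put(1, 22);
    counts.put(2, 5);
    counts.put(3, 10);

    // update an existing entry in place
    if (counts.containsKey(2)) {
      counts.put(2, counts.get(2) + 1);
    }

    // iterate over keys, as the excerpts do via keySet()
    int totalTokens = 0;
    for (int id : counts.keySet()) {
      totalTokens += counts.get(id);
    }

    System.out.println("types = " + counts.size() + ", tokens = " + totalTokens);
  }
}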


    assertEquals(array2, array1);
  }

  @Test
  public void testSerialize5() throws IOException {
    HMapII hmap1 = new HMapII();
    hmap1.put(1, 22);
    hmap1.put(2, 5);
    hmap1.put(3, 10);

    Document doc1 = new Document(hmap1);
    assertEquals(doc1.getNumberOfTokens(), 37);
    assertEquals(doc1.getNumberOfTypes(), 3);
    assertEquals(doc1.getNumberOfTopics(), 0);
    assertEquals(doc1.getGamma(), null);

    double[] array1 = new double[2];
    array1[0] = 0.238573f;
    array1[1] = 1.59382f;

    doc1.setGamma(array1);
    for (int i = 0; i < doc1.getGamma().length; i++) {
      assertEquals(doc1.getGamma()[i], array1[i], PRECISION);
    }

    Document doc2 = Document.create(doc1.serialize());
    HMapII hmap2 = doc2.getContent();
    double[] array2 = doc2.getGamma();

    assertEquals(doc2.getNumberOfTokens(), doc1.getNumberOfTokens());
    assertEquals(doc2.getNumberOfTypes(), doc1.getNumberOfTypes());
    assertEquals(doc2.getNumberOfTopics(), doc1.getNumberOfTopics());
    assertEquals(hmap2.size(), hmap1.size());
    assertEquals(array2.length, array1.length);

    Iterator<Integer> itr = hmap2.keySet().iterator();
    while (itr.hasNext()) {
      int key = itr.next();
      assertEquals(hmap2.get(key), hmap1.get(key));
    }

    for (int i = 0; i < array2.length; i++) {
      assertEquals(array2[i], array1[i], PRECISION);
    }


    int numEntries = in.readInt();
    if (numEntries <= 0) {
      content = null;
    } else {
      content = new HMapII();
      for (int i = 0; i < numEntries; i++) {
        int id = in.readInt();
        int count = in.readInt();
        content.put(id, count);
        numberOfTokens += count;
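The loop above reconstructs an HMapII from an entry count followed by (id, count) pairs. For reference, a write-side counterpart of the same wire format would look roughly like the sketch below; this is a hand-written illustration with hypothetical names, not the actual write/serialize code of the class being excerpted.

import java.io.DataOutput;
import java.io.IOException;

import edu.umd.cloud9.util.map.HMapII;

public class ContentWriter {
  // Hypothetical helper: writes the entry count, then id/count pairs,
  // matching what the read loop above expects.
  public static void writeContent(DataOutput out, HMapII content) throws IOException {
    if (content == null) {
      out.writeInt(0); // the read side treats numEntries <= 0 as a null map
      return;
    }
    out.writeInt(content.size());
    for (int id : content.keySet()) {
      out.writeInt(id);              // term id
      out.writeInt(content.get(id)); // term count
    }
  }
}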

      }
    }

    double[] logPhi = null;

    HMapII content = value.getContent();
    if (content == null) {
      System.err.println("Error: content was null for document " + key.toString());
      return;
    }

    // be careful when adjusting this initial value
    int gammaUpdateIterationCount = 1;
    do {
      likelihoodPhi = 0;

      for (int i = 0; i < numberOfTopics; i++) {
        tempGamma[i] = Gamma.digamma(tempGamma[i]);
        updateLogGamma[i] = Math.log(alpha[i]);
      }

      itr = content.keySet().iterator();
      while (itr.hasNext()) {
        int termID = itr.next();
        // acquire the corresponding beta vector for this term
        if (logPhiTable.containsKey(termID)) {
          // reuse existing object
          logPhi = logPhiTable.get(termID);
        } else {
          logPhi = new double[numberOfTopics];
          logPhiTable.put(termID, logPhi);
        }

        int termCounts = content.get(termID);
        tempLogBeta = retrieveBeta(numberOfTopics, expectLogBeta, termID, numberOfTerms);

        likelihoodPhi += updatePhi(numberOfTopics, termCounts, tempLogBeta, tempGamma, logPhi,
            updateLogGamma);
      }

      for (int i = 0; i < numberOfTopics; i++) {
        tempGamma[i] = Math.exp(updateLogGamma[i]);
      }

      gammaUpdateIterationCount++;

      // send out heart-beat message
      if (Math.random() < 0.01) {
        reporter.incrCounter(ParameterCounter.DUMMY_COUNTER, 1);
      }
    } while (gammaUpdateIterationCount < maximumGammaIteration);

    // compute the sum of gamma vector
    double sumGamma = 0;
    double likelihoodGamma = 0;
    for (int i = 0; i < numberOfTopics; i++) {
      sumGamma += tempGamma[i];
      likelihoodGamma += Gamma.lngamma(tempGamma[i]);
    }
    likelihoodGamma -= Gamma.lngamma(sumGamma);
    double documentLogLikelihood = likelihoodAlpha + likelihoodGamma + likelihoodPhi;
    reporter.incrCounter(ParameterCounter.LOG_LIKELIHOOD,
        (long) (-documentLogLikelihood * Settings.DEFAULT_COUNTER_SCALE));

    double digammaSumGamma = Gamma.digamma(sumGamma);
    for (int i = 0; i < numberOfTopics; i++) {
      totalAlphaSufficientStatistics[i] += Gamma.digamma(tempGamma[i]) - digammaSumGamma;
    }

    outputCollector = output;

    if (!directEmit) {
      if (learning) {
        if (Runtime.getRuntime().freeMemory() < Settings.MEMORY_THRESHOLD) {
          itr = totalPhi.keySet().iterator();
          while (itr.hasNext()) {
            int termID = itr.next();
            logPhi = totalPhi.get(termID);
            for (int i = 0; i < numberOfTopics; i++) {
              outputValue.set(logPhi[i]);

              // a *positive* topic index indicates the output is a phi value
              outputKey.set(i + 1, termID);
              output.collect(outputKey, outputValue);
            }
          }
          totalPhi.clear();

          // for (int i = 0; i < numberOfTopics; i++) {
          // a *zero* topic index and a *positive* topic index indicates the output is a term for
          // alpha updating
          // outputKey.set(0, i + 1);
          // outputValue.set(totalAlphaSufficientStatistics[i]);
          // output.collect(outputKey, outputValue);
          // totalAlphaSufficientStatistics[i] = 0;
          // }
        }

        itr = content.keySet().iterator();
        while (itr.hasNext()) {
          int termID = itr.next();
          if (termID < Settings.TOP_WORDS_FOR_CACHING) {
            if (totalPhi.containsKey(termID)) {
              logPhi = logPhiTable.get(termID);
              tempLogBeta = totalPhi.get(termID);
              for (int i = 0; i < numberOfTopics; i++) {
                tempLogBeta[i] = LogMath.add(logPhi[i], tempLogBeta[i]);
              }
            } else {
              totalPhi.put(termID, logPhiTable.get(termID));
            }
          } else {
            logPhi = logPhiTable.get(termID);
            for (int i = 0; i < numberOfTopics; i++) {
              outputValue.set(logPhi[i]);

              // a *positive* topic index indicates the output is a phi value
              outputKey.set(i + 1, termID);
              output.collect(outputKey, outputValue);
            }
          }
        }
      }
    } else {
      if (learning) {
        itr = content.keySet().iterator();
        while (itr.hasNext()) {
          int termID = itr.next();
          // only get the phi's of the current document
          logPhi = logPhiTable.get(termID);
          for (int i = 0; i < numberOfTopics; i++) {
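The phi accumulation in the excerpt above stays in log space: LogMath.add combines two log-domain values directly. Assuming it computes log(exp(a) + exp(b)), which is how it is used at the call site (an inference, not taken from LogMath's source), a numerically stable two-argument version looks like this:

public final class LogAdd {
  // Stable log-space addition: returns log(exp(a) + exp(b)) without overflow,
  // by factoring out the larger argument. Hypothetical stand-in for LogMath.add.
  public static double logAdd(double a, double b) {
    if (a == Double.NEGATIVE_INFINITY) return b; // adding log(0)
    if (b == Double.NEGATIVE_INFINITY) return a;
    double max = Math.max(a, b);
    return max + Math.log1p(Math.exp(Math.min(a, b) - max));
  }
}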

        int numEntries = in.readInt();

        if (numEntries <= 0) {
          content[i] = null;
        } else {
          content[i] = new HMapII();
          for (int j = 0; j < numEntries; j++) {
            int id = in.readInt();
            int count = in.readInt();
            content[i].put(id, count);
            numberOfWords[i] += count;

      }

      // if the cache is non-empty, a docnos file has been provided
      if (localFiles != null) {
        sLogger.setLevel(Level.INFO);
        samplesMap = new HMapII();
        try {
          LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFiles[0]));
          Text t = new Text();
          while (reader.readLine(t) != 0) {
            int docno = Integer.parseInt(t.toString());


    // Parse queries and find integer codes for the query terms.
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
    HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);

    Set<Integer> termidHistory = Sets.newHashSet();
    HMapII docLengths = new HMapII();

    SpamPercentileScore spamScores = new SpamPercentileScore();
    spamScores.initialize(spamPath, fs);
    int[] newDocids = DocumentUtility.spamSortDocids(spamScores);

    Posting posting = new Posting();
    List<TermPositions> positions = Lists.newArrayList();
    Map<Integer, TermPositions> positionsMap = Maps.newHashMap();

    for(int qid: queries.keySet()) {
      for(int termid: queries.get(qid)) {
        if(!termidHistory.contains(termid)) {
          termidHistory.add(termid);
          PostingsList pl = env.getPostingsList(env.getTermFromId(termid));
          PostingsReader reader = pl.getPostingsReader();

          positions.clear();
          positionsMap.clear();
          int[] data = new int[pl.getDf()];
          int index = 0;
          while (reader.nextPosting(posting)) {
            data[index] = newDocids[posting.getDocno()];
            positionsMap.put(data[index], new TermPositions(reader.getPositions(), reader.getTf()));
            docLengths.put(data[index], env.getDocumentLength(posting.getDocno()));
            index++;
          }
          Arrays.sort(data);

          for(int i = 0; i < data.length; i++) {
            positions.add(positionsMap.get(data[i]));
          }

          output.writeInt(termid);
          output.writeInt(pl.getDf());
          CompressedPositionalPostings.newInstance(data, positions).write(output);
        }
      }
      LOGGER.info("Compressed query " + qid);
    }

    output.writeInt(-1);

    output.writeInt(docLengths.size());
    for(int docid: docLengths.keySet()) {
      output.writeInt(docid);
      output.writeInt(docLengths.get(docid));
    }

    output.close();
  }
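The method above ends by writing a -1 termid sentinel and then a document-length trailer: the map size followed by (docid, length) pairs. A minimal read-back sketch for just that trailer, assuming a DataInput positioned right after the sentinel (an illustration with a hypothetical class name, not the project's reader), would be:

import java.io.DataInput;
import java.io.IOException;

import edu.umd.cloud9.util.map.HMapII;

public class DocLengthTrailer {
  // Hypothetical helper: reads the (docid, length) trailer written above into an HMapII.
  public static HMapII read(DataInput in) throws IOException {
    HMapII docLengths = new HMapII();
    int numDocs = in.readInt();
    for (int i = 0; i < numDocs; i++) {
      int docid = in.readInt();
      docLengths.put(docid, in.readInt());
    }
    return docLengths;
  }
}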

          + "1-Path to the Counts Table: A text file consisting of one "
              + "<length-of-anchor-text>\t<number-of-instances-to-sample> record per line.)\n"
                  + "2-Integer: Minimum number of target documents\n"
                      + "3-Integer: Maximum number of target documents");
    }
    counts = new HMapII();
    minNumberTargets = Integer.parseInt(params[1]);
    maxNumberTargets = Integer.parseInt(params[2]);

    try {
      FSDataInputStream in = fs.open(new Path(params[0]));

  public void initialize(FileSystem fs, String... params) {
    if (params.length != 1) {
      throw new RuntimeException(toString() + ": Missing counts table (path to a text file consisting of one "
          + "<number-of-target-documents>\t<number-of-instances-to-sample> record per line.)");
    }
    counts = new HMapII();

    try {
      FSDataInputStream in = fs.open(new Path(params[0]));
      String next;
      String[] records;
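The excerpt stops just before the parsing loop. A plausible continuation that fills the counts map from the tab-separated records described in the error message is sketched below; the BufferedReader/readLine details are an assumption, and only the record format comes from the message above.

      // Illustrative continuation (not the project's actual code): parse each
      // "<number-of-target-documents>\t<number-of-instances-to-sample>" record.
      // Assumes java.io.BufferedReader and java.io.InputStreamReader are imported.
      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
      while ((next = reader.readLine()) != null) {
        records = next.split("\t");
        counts.put(Integer.parseInt(records[0]), Integer.parseInt(records[1]));
      }
      reader.close();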

                            (float) env.getDefaultDf(), (float) env.getDefaultCf());
  }

  private void preparePostings(String postingsPath) throws Exception {
    postings = new HMapIV<CompressedPositionalPostings>();
    dfs = new HMapII();
    docLengths = new HMapII();

    FSDataInputStream input = fs.open(new Path(postingsPath));
    int termid = input.readInt();
    while(termid != -1) {
      dfs.put(termid, input.readInt());
