Package edu.umd.cloud9.util.map

Examples of edu.umd.cloud9.util.map.HMapII
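HMapII is Cloud9's hash map specialized for primitive int keys and int values, and the excerpts below exercise its core operations: put, get, containsKey, size, clear, and iteration over keySet(). As a quick orientation, here is a minimal, self-contained sketch built only from the methods that appear in these excerpts; the term-count scenario is purely illustrative.

import edu.umd.cloud9.util.map.HMapII;

public class HMapIIDemo {
  public static void main(String[] args) {
    // term id -> term count, mirroring the Document examples below
    HMapII counts = new HMapII();
    counts.put(1, 22);
    counts.put(2, 5);
    counts.put(3, 10);

    // update an existing entry in place
    if (counts.containsKey(2)) {
      counts.put(2, counts.get(2) + 1);
    }

    // iterate over keys, as the excerpts do via keySet()
    int totalTokens = 0;
    for (int id : counts.keySet()) {
      totalTokens += counts.get(id);
    }

    System.out.println("types = " + counts.size() + ", tokens = " + totalTokens);
  }
}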


    assertEquals(array2, array1);
  }

  @Test
  public void testSerialize5() throws IOException {
    HMapII hmap1 = new HMapII();
    hmap1.put(1, 22);
    hmap1.put(2, 5);
    hmap1.put(3, 10);

    Document doc1 = new Document(hmap1);
    assertEquals(doc1.getNumberOfTokens(), 37);
    assertEquals(doc1.getNumberOfTypes(), 3);
    assertEquals(doc1.getNumberOfTopics(), 0);
    assertEquals(doc1.getGamma(), null);

    double[] array1 = new double[2];
    array1[0] = 0.238573f;
    array1[1] = 1.59382f;

    doc1.setGamma(array1);
    for (int i = 0; i < doc1.getGamma().length; i++) {
      assertEquals(doc1.getGamma()[i], array1[i], PRECISION);
    }

    Document doc2 = Document.create(doc1.serialize());
    HMapII hmap2 = doc2.getContent();
    double[] array2 = doc2.getGamma();

    assertEquals(doc2.getNumberOfTokens(), doc1.getNumberOfTokens());
    assertEquals(doc2.getNumberOfTypes(), doc1.getNumberOfTypes());
    assertEquals(doc2.getNumberOfTopics(), doc1.getNumberOfTopics());
    assertEquals(hmap2.size(), hmap1.size());
    assertEquals(array2.length, array1.length);

    Iterator<Integer> itr = hmap2.keySet().iterator();
    while (itr.hasNext()) {
      int key = itr.next();
      assertEquals(hmap2.get(key), hmap1.get(key));
    }

    for (int i = 0; i < array2.length; i++) {
      assertEquals(array2[i], array1[i], PRECISION);
    }


    int numEntries = in.readInt();
    if (numEntries <= 0) {
      content = null;
    } else {
      content = new HMapII();
      for (int i = 0; i < numEntries; i++) {
        int id = in.readInt();
        int count = in.readInt();
        content.put(id, count);
        numberOfTokens += count;
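The loop above reconstructs an HMapII from an entry count followed by (id, count) pairs. For reference, a write-side counterpart of the same wire format would look roughly like the sketch below; this is a hand-written illustration with hypothetical names, not the actual write/serialize code of the class being excerpted.

import java.io.DataOutput;
import java.io.IOException;

import edu.umd.cloud9.util.map.HMapII;

public class ContentWriter {
  // Hypothetical helper: writes the entry count, then id/count pairs,
  // matching what the read loop above expects.
  public static void writeContent(DataOutput out, HMapII content) throws IOException {
    if (content == null) {
      out.writeInt(0); // the read side treats numEntries <= 0 as a null map
      return;
    }
    out.writeInt(content.size());
    for (int id : content.keySet()) {
      out.writeInt(id);              // term id
      out.writeInt(content.get(id)); // term count
    }
  }
}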

      }
    }

    double[] logPhi = null;

    HMapII content = value.getContent();
    if (content == null) {
      System.err.println("Error: content was null for document " + key.toString());
      return;
    }

    // be careful when adjusting this initial value
    int gammaUpdateIterationCount = 1;
    do {
      likelihoodPhi = 0;

      for (int i = 0; i < numberOfTopics; i++) {
        tempGamma[i] = Gamma.digamma(tempGamma[i]);
        updateLogGamma[i] = Math.log(alpha[i]);
      }

      itr = content.keySet().iterator();
      while (itr.hasNext()) {
        int termID = itr.next();
        // acquire the corresponding beta vector for this term
        if (logPhiTable.containsKey(termID)) {
          // reuse existing object
          logPhi = logPhiTable.get(termID);
        } else {
          logPhi = new double[numberOfTopics];
          logPhiTable.put(termID, logPhi);
        }

        int termCounts = content.get(termID);
        tempLogBeta = retrieveBeta(numberOfTopics, expectLogBeta, termID, numberOfTerms);

        likelihoodPhi += updatePhi(numberOfTopics, termCounts, tempLogBeta, tempGamma, logPhi,
            updateLogGamma);
      }

      for (int i = 0; i < numberOfTopics; i++) {
        tempGamma[i] = Math.exp(updateLogGamma[i]);
      }

      gammaUpdateIterationCount++;

      // send out heart-beat message
      if (Math.random() < 0.01) {
        reporter.incrCounter(ParameterCounter.DUMMY_COUNTER, 1);
      }
    } while (gammaUpdateIterationCount < maximumGammaIteration);

    // compute the sum of gamma vector
    double sumGamma = 0;
    double likelihoodGamma = 0;
    for (int i = 0; i < numberOfTopics; i++) {
      sumGamma += tempGamma[i];
      likelihoodGamma += Gamma.lngamma(tempGamma[i]);
    }
    likelihoodGamma -= Gamma.lngamma(sumGamma);
    double documentLogLikelihood = likelihoodAlpha + likelihoodGamma + likelihoodPhi;
    reporter.incrCounter(ParameterCounter.LOG_LIKELIHOOD,
        (long) (-documentLogLikelihood * Settings.DEFAULT_COUNTER_SCALE));

    double digammaSumGamma = Gamma.digamma(sumGamma);
    for (int i = 0; i < numberOfTopics; i++) {
      totalAlphaSufficientStatistics[i] += Gamma.digamma(tempGamma[i]) - digammaSumGamma;
    }

    outputCollector = output;

    if (!directEmit) {
      if (learning) {
        if (Runtime.getRuntime().freeMemory() < Settings.MEMORY_THRESHOLD) {
          itr = totalPhi.keySet().iterator();
          while (itr.hasNext()) {
            int termID = itr.next();
            logPhi = totalPhi.get(termID);
            for (int i = 0; i < numberOfTopics; i++) {
              outputValue.set(logPhi[i]);

              // a *positive* topic index indicates the output is a phi value
              outputKey.set(i + 1, termID);
              output.collect(outputKey, outputValue);
            }
          }
          totalPhi.clear();

          // for (int i = 0; i < numberOfTopics; i++) {
          // a *zero* topic index and a *positive* topic index indicates the output is a term for
          // alpha updating
          // outputKey.set(0, i + 1);
          // outputValue.set(totalAlphaSufficientStatistics[i]);
          // output.collect(outputKey, outputValue);
          // totalAlphaSufficientStatistics[i] = 0;
          // }
        }

        itr = content.keySet().iterator();
        while (itr.hasNext()) {
          int termID = itr.next();
          if (termID < Settings.TOP_WORDS_FOR_CACHING) {
            if (totalPhi.containsKey(termID)) {
              logPhi = logPhiTable.get(termID);
              tempLogBeta = totalPhi.get(termID);
              for (int i = 0; i < numberOfTopics; i++) {
                tempLogBeta[i] = LogMath.add(logPhi[i], tempLogBeta[i]);
              }
            } else {
              totalPhi.put(termID, logPhiTable.get(termID));
            }
          } else {
            logPhi = logPhiTable.get(termID);
            for (int i = 0; i < numberOfTopics; i++) {
              outputValue.set(logPhi[i]);

              // a *positive* topic index indicates the output is a phi value
              outputKey.set(i + 1, termID);
              output.collect(outputKey, outputValue);
            }
          }
        }
      }
    } else {
      if (learning) {
        itr = content.keySet().iterator();
        while (itr.hasNext()) {
          int termID = itr.next();
          // only get the phi's of the current document
          logPhi = logPhiTable.get(termID);
          for (int i = 0; i < numberOfTopics; i++) {
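The phi accumulation in the excerpt above stays in log space: LogMath.add combines two log-domain values directly. Assuming it computes log(exp(a) + exp(b)), which is how it is used at the call site (an inference, not taken from LogMath's source), a numerically stable two-argument version looks like this:

public final class LogAdd {
  // Stable log-space addition: returns log(exp(a) + exp(b)) without overflow,
  // by factoring out the larger argument. Hypothetical stand-in for LogMath.add.
  public static double logAdd(double a, double b) {
    if (a == Double.NEGATIVE_INFINITY) return b; // adding log(0)
    if (b == Double.NEGATIVE_INFINITY) return a;
    double max = Math.max(a, b);
    return max + Math.log1p(Math.exp(Math.min(a, b) - max));
  }
}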

        int numEntries = in.readInt();

        if (numEntries <= 0) {
          content[i] = null;
        } else {
          content[i] = new HMapII();
          for (int j = 0; j < numEntries; j++) {
            int id = in.readInt();
            int count = in.readInt();
            content[i].put(id, count);
            numberOfWords[i] += count;

      }

      // if the cache is non-empty, a docnos file has been provided
      if (localFiles != null) {
        sLogger.setLevel(Level.INFO);
        samplesMap = new HMapII();
        try {
          LineReader reader = new LineReader(FileSystem.getLocal(conf).open(localFiles[0]));
          Text t = new Text();
          while (reader.readLine(t) != 0) {
            int docno = Integer.parseInt(t.toString());


    // Parse queries and find integer codes for the query terms.
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
    HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);

    Set<Integer> termidHistory = Sets.newHashSet();
    HMapII docLengths = new HMapII();

    SpamPercentileScore spamScores = new SpamPercentileScore();
    spamScores.initialize(spamPath, fs);
    int[] newDocids = DocumentUtility.spamSortDocids(spamScores);

    Posting posting = new Posting();
    List<TermPositions> positions = Lists.newArrayList();
    Map<Integer, TermPositions> positionsMap = Maps.newHashMap();

    for(int qid: queries.keySet()) {
      for(int termid: queries.get(qid)) {
        if(!termidHistory.contains(termid)) {
          termidHistory.add(termid);
          PostingsList pl = env.getPostingsList(env.getTermFromId(termid));
          PostingsReader reader = pl.getPostingsReader();

          positions.clear();
          positionsMap.clear();
          int[] data = new int[pl.getDf()];
          int index = 0;
          while (reader.nextPosting(posting)) {
            data[index] = newDocids[posting.getDocno()];
            positionsMap.put(data[index], new TermPositions(reader.getPositions(), reader.getTf()));
            docLengths.put(data[index], env.getDocumentLength(posting.getDocno()));
            index++;
          }
          Arrays.sort(data);

          for(int i = 0; i < data.length; i++) {
            positions.add(positionsMap.get(data[i]));
          }

          output.writeInt(termid);
          output.writeInt(pl.getDf());
          CompressedPositionalPostings.newInstance(data, positions).write(output);
        }
      }
      LOGGER.info("Compressed query " + qid);
    }

    output.writeInt(-1);

    output.writeInt(docLengths.size());
    for(int docid: docLengths.keySet()) {
      output.writeInt(docid);
      output.writeInt(docLengths.get(docid));
    }

    output.close();
  }
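The method above ends by writing a -1 termid sentinel and then a document-length trailer: the map size followed by (docid, length) pairs. A minimal read-back sketch for just that trailer, assuming a DataInput positioned right after the sentinel (an illustration with a hypothetical class name, not the project's reader), would be:

import java.io.DataInput;
import java.io.IOException;

import edu.umd.cloud9.util.map.HMapII;

public class DocLengthTrailer {
  // Hypothetical helper: reads the (docid, length) trailer written above into an HMapII.
  public static HMapII read(DataInput in) throws IOException {
    HMapII docLengths = new HMapII();
    int numDocs = in.readInt();
    for (int i = 0; i < numDocs; i++) {
      int docid = in.readInt();
      docLengths.put(docid, in.readInt());
    }
    return docLengths;
  }
}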

          + "1-Path to the Counts Table: A text file consisting of one "
              + "<length-of-anchor-text>\t<number-of-instances-to-sample> record per line.)\n"
                  + "2-Integer: Minimum number of target documents\n"
                      + "3-Integer: Maximum number of target documents");
    }
    counts = new HMapII();
    minNumberTargets = Integer.parseInt(params[1]);
    maxNumberTargets = Integer.parseInt(params[2]);

    try {
      FSDataInputStream in = fs.open(new Path(params[0]));

  public void initialize(FileSystem fs, String... params) {
    if (params.length != 1) {
      throw new RuntimeException(toString() + ": Missing counts table (path to a text file consisting of one "
          + "<number-of-target-documents>\t<number-of-instances-to-sample> record per line.)");
    }
    counts = new HMapII();

    try {
      FSDataInputStream in = fs.open(new Path(params[0]));
      String next;
      String[] records;
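The excerpt stops just before the parsing loop. A plausible continuation that fills the counts map from the tab-separated records described in the error message is sketched below; the BufferedReader/readLine details are an assumption, and only the record format comes from the message above.

      // Illustrative continuation (not the project's actual code): parse each
      // "<number-of-target-documents>\t<number-of-instances-to-sample>" record.
      // Assumes java.io.BufferedReader and java.io.InputStreamReader are imported.
      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
      while ((next = reader.readLine()) != null) {
        records = next.split("\t");
        counts.put(Integer.parseInt(records[0]), Integer.parseInt(records[1]));
      }
      reader.close();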

                            (float) env.getDefaultDf(), (float) env.getDefaultCf());
  }

  private void preparePostings(String postingsPath) throws Exception {
    postings = new HMapIV<CompressedPositionalPostings>();
    dfs = new HMapII();
    docLengths = new HMapII();

    FSDataInputStream input = fs.open(new Path(postingsPath));
    int termid = input.readInt();
    while(termid != -1) {
      dfs.put(termid, input.readInt());
