Package gnu.trove

Examples of gnu.trove.TIntIntHashMap
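
The excerpts below (topic-modelling and index-building code) rely on a small set of TIntIntHashMap operations: put, get, adjustValue, adjustOrPutValue, remove, keys, and getValues. As a quick orientation before the longer excerpts, here is a minimal, self-contained sketch of that API. It is not taken from any of the projects below; the class name and values are made up for illustration, and it assumes the Trove 2.x gnu.trove.TIntIntHashMap (not the later gnu.trove.map.hash layout). Note that the excerpts treat keys() and getValues() as parallel arrays.

  import gnu.trove.TIntIntHashMap;

  // Illustrative sketch only; assumes the Trove 2.x gnu.trove.TIntIntHashMap API.
  public class TIntIntHashMapDemo {
    public static void main(String[] args) {
      // Count occurrences of primitive int keys, as the topic-count code below does.
      int[] topics = {3, 1, 3, 7, 1, 3};

      TIntIntHashMap counts = new TIntIntHashMap();
      for (int topic : topics) {
        // If the key is present, add 1 to its value; otherwise insert it with value 1.
        counts.adjustOrPutValue(topic, 1, 1);
      }

      // Decrement a count, removing the key rather than leaving a zero entry
      // (the same pattern the sampling code below uses when a token is reassigned).
      if (counts.get(1) == 1) {
        counts.remove(1);
      } else {
        counts.adjustValue(1, -1);
      }

      // keys() and getValues() return primitive int[] snapshots;
      // the excerpts below treat them as parallel arrays.
      int[] keys = counts.keys();
      int[] values = counts.getValues();
      for (int i = 0; i < keys.length; i++) {
        System.out.println("topic " + keys[i] + " occurs " + values[i] + " time(s)");
      }
    }
  }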


  {
    long startTime = System.currentTimeMillis();
 
    int[] oneDocTopics = topicSequence.getFeatures();

    TIntIntHashMap currentTypeTopicCounts;
    int type, oldTopic, newTopic;
    double[] topicDistribution;
    double topicDistributionSum;
    int docLen = featureSequence.getLength();
    int adjustedValue;
    int[] topicIndices, topicCounts;

    double weight;
 
    // populate topic counts
    Arrays.fill(oneDocTopicCounts, 0);

    if (readjustTopicsAndStats) {
      for (int token = 0; token < docLen; token++) {
        oneDocTopicCounts[ oneDocTopics[token] ]++;
      }
    }

    // Iterate over the tokens (words) in the document
    for (int token = 0; token < docLen; token++) {
      type = featureSequence.getIndexAtPosition(token);
      oldTopic = oneDocTopics[token];
      currentTypeTopicCounts = typeTopicCounts[type];
      assert (currentTypeTopicCounts.size() != 0);

      if (readjustTopicsAndStats) {
        // Remove this token from all counts
        oneDocTopicCounts[oldTopic]--;
        adjustedValue = currentTypeTopicCounts.adjustOrPutValue(oldTopic, -1, -1);
        if (adjustedValue == 0) currentTypeTopicCounts.remove(oldTopic);
        else if (adjustedValue == -1) throw new IllegalStateException ("Token count in topic went negative.");
        tokensPerTopic[oldTopic]--;
      }

      // Build a distribution over topics for this token
      topicIndices = currentTypeTopicCounts.keys();
      topicCounts = currentTypeTopicCounts.getValues();
      topicDistribution = new double[topicIndices.length];
      // TODO Yipes, memory allocation in the inner loop!  But note that .keys() and .getValues() are doing this too.
      topicDistributionSum = 0;
      for (int i = 0; i < topicCounts.length; i++) {
        int topic = topicIndices[i];


                      boolean shouldSaveState,
                      boolean readjustTopicsAndStats /* currently ignored */) {

    int[] oneDocTopics = topicSequence.getFeatures();

    TIntIntHashMap currentTypeTopicCounts;
    int type, oldTopic, newTopic;
    double topicWeightsSum;
    int docLength = tokenSequence.getLength();

    //    populate topic counts
    TIntIntHashMap localTopicCounts = new TIntIntHashMap();
    for (int position = 0; position < docLength; position++) {
      localTopicCounts.adjustOrPutValue(oneDocTopics[position], 1, 1);
    }

    //    Initialize the topic count/beta sampling bucket
    double topicBetaMass = 0.0;
    for (int topic: localTopicCounts.keys()) {
      int n = localTopicCounts.get(topic);

      //      initialize the normalization constant for the (B * n_{t|d}) term
      topicBetaMass += beta * n / (tokensPerTopic[topic] + betaSum);

      //      update the coefficients for the non-zero topics
      cachedCoefficients[topic] = (alpha[topic] + n) / (tokensPerTopic[topic] + betaSum);
    }

    double topicTermMass = 0.0;

    double[] topicTermScores = new double[numTopics];
    int[] topicTermIndices;
    int[] topicTermValues;
    int i;
    double score;

    //  Iterate over the positions (words) in the document
    for (int position = 0; position < docLength; position++) {
      type = tokenSequence.getIndexAtPosition(position);
      oldTopic = oneDocTopics[position];

      currentTypeTopicCounts = typeTopicCounts[type];
      assert(currentTypeTopicCounts.get(oldTopic) >= 0);

      //  Remove this token from all counts.
      //   Note that we actually want to remove the key if it goes
      //    to zero, not set it to 0.
      if (currentTypeTopicCounts.get(oldTopic) == 1) {
        currentTypeTopicCounts.remove(oldTopic);
      }
      else {
        currentTypeTopicCounts.adjustValue(oldTopic, -1);
      }

      smoothingOnlyMass -= alpha[oldTopic] * beta /
        (tokensPerTopic[oldTopic] + betaSum);
      topicBetaMass -= beta * localTopicCounts.get(oldTopic) /
        (tokensPerTopic[oldTopic] + betaSum);
     
      if (localTopicCounts.get(oldTopic) == 1) {
        localTopicCounts.remove(oldTopic);
      }
      else {
        localTopicCounts.adjustValue(oldTopic, -1);
      }

      tokensPerTopic[oldTopic]--;
     
      smoothingOnlyMass += alpha[oldTopic] * beta /
        (tokensPerTopic[oldTopic] + betaSum);
      topicBetaMass += beta * localTopicCounts.get(oldTopic) /
        (tokensPerTopic[oldTopic] + betaSum);
     
      cachedCoefficients[oldTopic] =
        (alpha[oldTopic] + localTopicCounts.get(oldTopic)) /
        (tokensPerTopic[oldTopic] + betaSum);

      topicTermMass = 0.0;

      topicTermIndices = currentTypeTopicCounts.keys();
      topicTermValues = currentTypeTopicCounts.getValues();

      for (i=0; i < topicTermIndices.length; i++) {
        int topic = topicTermIndices[i];
        score =
          cachedCoefficients[topic] * topicTermValues[i];
        //        ((alpha[topic] + localTopicCounts.get(topic)) *
        //        topicTermValues[i]) /
        //        (tokensPerTopic[topic] + betaSum);
       
        //        Note: I tried only doing this next bit if
        //        score > 0, but it didn't make any difference,
        //        at least in the first few iterations.
       
        topicTermMass += score;
        topicTermScores[i] = score;
        //        topicTermIndices[i] = topic;
      }
      //      indicate that this is the last topic
      //      topicTermIndices[i] = -1;
     
      double sample = random.nextUniform() * (smoothingOnlyMass + topicBetaMass + topicTermMass);
      double origSample = sample;

//      Make sure it actually gets set
      newTopic = -1;

      if (sample < topicTermMass) {
        //topicTermCount++;

        i = -1;
        while (sample > 0) {
          i++;
          sample -= topicTermScores[i];
        }
        newTopic = topicTermIndices[i];

      }
      else {
        sample -= topicTermMass;

        if (sample < topicBetaMass) {
          //betaTopicCount++;

          sample /= beta;

          topicTermIndices = localTopicCounts.keys();
          topicTermValues = localTopicCounts.getValues();

          for (i=0; i < topicTermIndices.length; i++) {
            newTopic = topicTermIndices[i];

            sample -= topicTermValues[i] /
              (tokensPerTopic[newTopic] + betaSum);

            if (sample <= 0.0) {
              break;
            }
          }

        }
        else {
          //smoothingOnlyCount++;

          sample -= topicBetaMass;

          sample /= beta;

          for (int topic = 0; topic < numTopics; topic++) {
            sample -= alpha[topic] /
              (tokensPerTopic[topic] + betaSum);

            if (sample <= 0.0) {
              newTopic = topic;
              break;
            }
          }

        }

      }

      if (newTopic == -1) {
        System.err.println("LDAHyper sampling error: "+ origSample + " " + sample + " " + smoothingOnlyMass + " " +
            topicBetaMass + " " + topicTermMass);
        newTopic = numTopics-1; // TODO is this appropriate
        //throw new IllegalStateException ("LDAHyper: New topic not sampled.");
      }
      //assert(newTopic != -1);

      //      Put that new topic into the counts
      oneDocTopics[position] = newTopic;
      currentTypeTopicCounts.adjustOrPutValue(newTopic, 1, 1);

      smoothingOnlyMass -= alpha[newTopic] * beta /
        (tokensPerTopic[newTopic] + betaSum);
      topicBetaMass -= beta * localTopicCounts.get(newTopic) /
        (tokensPerTopic[newTopic] + betaSum);

      localTopicCounts.adjustOrPutValue(newTopic, 1, 1);
      tokensPerTopic[newTopic]++;

      //      update the coefficients for the non-zero topics
      cachedCoefficients[newTopic] =
        (alpha[newTopic] + localTopicCounts.get(newTopic)) /
        (tokensPerTopic[newTopic] + betaSum);

      smoothingOnlyMass += alpha[newTopic] * beta /
        (tokensPerTopic[newTopic] + betaSum);
      topicBetaMass += beta * localTopicCounts.get(newTopic) /
        (tokensPerTopic[newTopic] + betaSum);

      assert(currentTypeTopicCounts.get(newTopic) >= 0);

    }

    //    Clean up our mess: reset the coefficients to values with only
    //    smoothing. The next doc will update its own non-zero topics...
    for (int topic: localTopicCounts.keys()) {
      cachedCoefficients[topic] =
        alpha[topic] / (tokensPerTopic[topic] + betaSum);
    }

    if (shouldSaveState) {
      //      Update the document-topic count histogram,
      //      for dirichlet estimation
      docLengthCounts[ docLength ]++;
      for (int topic: localTopicCounts.keys()) {
        topicDocCounts[topic][ localTopicCounts.get(topic) ]++;
      }
    }
  }

  protected StateLabelMap map;
  protected TIntArrayList cache;

  public TwoLabelGEConstraints() {
    this.constraintsList = new ArrayList<TwoLabelGEConstraint>();
    this.constraintsMap = new TIntIntHashMap();
    this.map = null;
    this.cache = new TIntArrayList();
  }

  {
    //System.out.println ("HashedSparseVector setIndex2Location indices.length="+indices.length+" maxindex="+indices[indices.length-1]);
    assert (index2location == null);
    assert (indices.length > 0);
    this.maxIndex = indices[indices.length - 1];
    this.index2location = new TIntIntHashMap (numLocations ());
    //index2location.setDefaultValue (-1);
    for (int i = 0; i < indices.length; i++)
      index2location.put (indices[i], i);
  }

    inTop50 = 0;
    inTop20 = 0;
    inTop10 = 0;
    inTop1 = 0;
    notInTop50 = 0;
    rankRelevantDocument = new TIntIntHashMap();
    int queryCounter = -1;
    int previousQueryId = -1;
    try {
      final BufferedReader br = Files.openFileReader(resultFilename);
      int firstSpaceIndex;

        //logger.warn("Using old-fashioned number of terms strategy. Please consider setting invertedfile.processpointers for forward compatible use");
      }

      while (i < numberOfUniqueTerms) {
        iterationCounter++;
        TIntIntHashMap codesHashMap = null;
        TIntArrayList[][] tmpStorage = null;
        IntLongTuple results = null;

        //logger.info("Iteration " + iterationCounter+ iteration_message_suffix);

        // traverse the lexicon looking to determine the first N() terms
        // this can be done two ways: for the first X terms
        // OR for the first Y pointers

        startProcessingLexicon = System.currentTimeMillis();

        if (numberOfPointersPerIteration > 0) { // we've been configured to run with a given number of pointers
          //logger.info("Scanning lexicon for " + numberOfPointersPerIteration + " pointers");
          /*
           * this is less speed efficient, as we have no way to guess
           * how many terms it will take to fill the given number of
           * pointers. The advantage is that memory consumption is
           * more directly correlated to number of pointers than
           * number of terms, so when indexing tricky collections, it
           * is easier to find a number of pointers that can fit in
           * memory
           */

          codesHashMap = new TIntIntHashMap();
          ArrayList<TIntArrayList[]> tmpStorageStorage = new ArrayList<TIntArrayList[]>();
          results = scanLexiconForPointers(
              numberOfPointersPerIteration, lexiconStream,
              codesHashMap, tmpStorageStorage);
          tmpStorage = (TIntArrayList[][]) tmpStorageStorage
              .toArray(new TIntArrayList[0][0]);

        } else { // we're running with a given number of terms
          tmpStorage = new TIntArrayList[processTerms][];
          codesHashMap = new TIntIntHashMap(processTerms);
          results = scanLexiconForTerms(processTerms, lexiconStream,
              codesHashMap, tmpStorage);
        }

        processTerms = results.Terms; // no of terms to process on this iteration
        numberOfPointersThisIteration = results.Pointers;
        numberOfPointers += results.Pointers; // no of pointers to process on this iteration
        i += processTerms;

        if (processTerms == 0)
          break;
        //logger.info("time to process part of lexicon: "  + ((System.currentTimeMillis() - startProcessingLexicon) / 1000D));

        InvertedIndexBuilder.displayMemoryUsage(r);

        // Scan the direct file looking for those terms
        startTraversingDirectFile = System.currentTimeMillis();
        traverseDirectFile(codesHashMap, tmpStorage);
        //logger.info("time to traverse direct file: "+ ((System.currentTimeMillis() - startTraversingDirectFile) / 1000D));

        InvertedIndexBuilder.displayMemoryUsage(r);

        // write the inverted file for this part of the lexicon, ie
        // processTerms number of terms
        startWritingInvertedFile = System.currentTimeMillis();
        numberOfTokens += writeInvertedFilePart(dos, tmpStorage,
            processTerms);
        //logger.info("time to write inverted file: "  + ((System.currentTimeMillis() - startWritingInvertedFile) / 1000D));

        InvertedIndexBuilder.displayMemoryUsage(r);

        //logger.info("time to perform one iteration: "+ ((System.currentTimeMillis() - startProcessingLexicon) / 1000D));
        //logger.info("number of pointers processed: "+ numberOfPointersThisIteration);

        tmpStorage = null;
        codesHashMap.clear();
        codesHashMap = null;
      }

      //logger.info("Finished generating inverted file, rewriting lexicon");
//      this.numberOfDocuments = numberOfDocuments;

      }
   
      while(i<_numberOfUniqueTerms)
      {
        iterationCounter++;
        TIntIntHashMap codesHashMap = null;
        TIntArrayList[][] tmpStorage = null;
        IntLongTuple results = null;
       
        //logger.info("Iteration "+iterationCounter+iteration_message_suffix);
       
        //traverse the lexicon looking to determine the first N() terms
        //this can be done two ways: for the first X terms
        //OR for the first Y pointers
        //ie either N=X, or N=fn(Y)
       
        startProcessingLexicon = System.currentTimeMillis();
       
        if (numberOfPointersPerIteration > 0)
        {//we've been configured to run with a given number of pointers
          if (logger.isDebugEnabled())
            logger.debug("Scanning lexicon for "+ numberOfPointersPerIteration + " pointers");
       
          /* this is less speed efficient, as we have no way to guess how many
           * terms it will take to fill the given number of pointers.
           * The advantage is that memory consumption is more directly correlated
           * to number of pointers than number of terms, so when indexing tricky
           * collections, it is easier to find a number of pointers that can fit
           * in memory */
          
          codesHashMap = new TIntIntHashMap();
          ArrayList<TIntArrayList[]> tmpStorageStorage = new ArrayList<TIntArrayList[]>();
          results = scanLexiconForPointers(
            numberOfPointersPerIteration,
            lexiconStream,
            codesHashMap,
            tmpStorageStorage);
          tmpStorage = (TIntArrayList[][]) tmpStorageStorage.toArray(
            new TIntArrayList[0][0]);
         
        }
        else//we're running with a given number of terms
        {
          if (logger.isDebugEnabled())
            logger.debug("Scanning lexicon for " + processTerms+" terms");
          tmpStorage = new TIntArrayList[processTerms][];
          codesHashMap = new TIntIntHashMap(processTerms);
          results = scanLexiconForTerms(
            processTerms,
            lexiconStream,
            codesHashMap,
            tmpStorage);
        }
       
        processTerms = results.Terms;//no of terms to process on this iteration
        numberOfPointersThisIteration = results.Pointers;
        _numberOfPointers += results.Pointers;//no of pointers to process on this iteration
        logger.debug("Selected " + results.Terms + " terms, " + results.Pointers + " pointers for this iteration");
       
        if (results.Terms == 0)
        {
          //logger.warn("No terms found this iteration - presuming end of iteration cycle (perhaps some lexicon terms are empty)");
          break;
        }
        i += processTerms;
       
        if (logger.isDebugEnabled())
          logger.debug("time to process part of lexicon: " + ((System.currentTimeMillis()- startProcessingLexicon) / 1000D));
       
       
        displayMemoryUsage(r);
       
        //Scan the direct file looking for those terms
        startTraversingDirectFile = System.currentTimeMillis();
        traverseDirectFile(codesHashMap, tmpStorage);
        if (logger.isDebugEnabled())
          logger.debug("time to traverse direct file: " + ((System.currentTimeMillis() - startTraversingDirectFile) / 1000D));
       
        displayMemoryUsage(r);     
 
        //write the inverted file for this part of the lexicon, ie processTerms number of terms
        startWritingInvertedFile = System.currentTimeMillis();
        _numberOfTokens += writeInvertedFilePart(dos, tmpStorage, processTerms);
        if (logger.isDebugEnabled())
          logger.debug("time to write inverted file: "
           + ((System.currentTimeMillis()- startWritingInvertedFile) / 1000D));
       
             
        displayMemoryUsage(r);
 
        if (logger.isDebugEnabled()) {
          logger.debug(
              "time to perform one iteration: "
                + ((System.currentTimeMillis() - startProcessingLexicon)
                  / 1000D));
          logger.debug(
            "number of pointers processed: "
              + numberOfPointersThisIteration);
        }
       
       
        tmpStorage  = null;
        codesHashMap.clear();
        codesHashMap = null;
      }
     
     
     

     
      final int fieldCount = srcFieldCount1;
     
      //creating a new map between new and old term codes
      if (keepTermCodeMap)
        termcodeHashmap = new TIntIntHashMap();

      //setting the input streams
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 =
        (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 =

   
    int blockLengthIndex = -1;
    //BlockInvertedIndex invIndex = (BlockInvertedIndex)inv;
    int[][][] postings = new int[phraseLength][][];
    for (int i = 0; i < phraseLength; i++) {
      docidsMap[i] = new TIntIntHashMap();
      String t = ((SingleTermQuery) phraseTerms.get(i)).getTerm();
      if (terms.getStatistics(t) == null)
      {
        LexiconEntry le = index.getLexicon().getLexiconEntry(t);
        if (le == null)

  }
 
  public void checkInvertedIndexStream(Index index, int[] documentLengths) throws Exception
  {
    final int numDocs = index.getCollectionStatistics().getNumberOfDocuments();
    TIntIntHashMap calculatedDocLengths = new TIntIntHashMap();
    InvertedIndexInputStream iiis = (InvertedIndexInputStream) index.getIndexStructureInputStream("inverted");
    assertNotNull(iiis);
    int ithTerm = -1;
    while(iiis.hasNext())
    {
      ithTerm++;
      final IterablePosting ip = iiis.getNextPostings();
      int count = 0;
      final int expected = iiis.getNumberOfCurrentPostings();
      while(ip.next() != IterablePosting.EOL)
      {
        //System.err.println("Got id " + ip.getId());
        assertTrue("Got too big a docid ("+ip.getId()+") from inverted index input stream for term at index " + ithTerm, ip.getId() < numDocs);
        count++;
        calculatedDocLengths.adjustOrPutValue(ip.getId(), ip.getFrequency(), ip.getFrequency());
      }
      assertEquals(expected, count);
    }
    assertEquals("Number of documents is unexpected,", documentLengths.length - countZero(documentLengths), calculatedDocLengths.size());
    long tokens = 0;
    for(int docid : calculatedDocLengths.keys())
    {
      assertEquals("Document length for docid "+docid+" is unexpected,", documentLengths[docid], calculatedDocLengths.get(docid));
      tokens += calculatedDocLengths.get(docid);
    }
    assertEquals("Number of tokens is unexpected,", StaTools.sum(documentLengths), tokens);
  }
