Examples of edu.umd.cloud9.collection.wikipedia.WikipediaPage

Package edu.umd.cloud9.collection.wikipedia

Examples of edu.umd.cloud9.collection.wikipedia.WikipediaPage

edu.umd.cloud9.util.map.HMapII
A page from Wikipedia. @author Jimmy Lin @author Peter Exner

      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);


      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);


      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (translateOnly == null || !translateOnly.equals("indri") || indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {      
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);

View Full Code Here

    }


    // in SCFG rule such as a b X1 X2 c --> X1 d e X2 f, we want to find out the src/trg tokens that are aligned to some trg/src token, ignoring the X variable
    // we can then decide if we want to include it as a multi-token phrase in our query representation based on various heuristics (e.g., only include if no X in between of tokens)
    String fPhrase = "";
    ArrayListOfInts sourceTokenIds = new ArrayListOfInts();      
    ArrayListOfInts targetTokenIds = new ArrayListOfInts();
    int f=0;
    for (; f < lhs.length; f++) {
      String fTerm = lhs[f];
      if (queryLangTokenizer.isStopWord(fTerm) || fTerm.matches("\\[X,\\d+\\]") || fTerm.matches("<s>") || fTerm.matches("</s>")) {
        continue;
      }


      srcTokenCnt.increment(fTerm);
      sourceTokenIds.add(f);


      ArrayListOfInts ids;
      if (isPassThrough){
        ids = new ArrayListOfInts();
        ids.add(0);
      }else {
        ids = one2manyAlign.get(f);
      }


      if (ids == null || (isOne2Many == 0 && ids.size() > 1)) {
        continue;
      }


      // find phrase in LHS and match to phrase in RHS
      if (isMany2Many) {
        fPhrase += fTerm + " ";
        targetTokenIds = targetTokenIds.mergeNoDuplicates(ids);        
      }


      String eTerm = null;
      for (int e : ids) {
        eTerm = rhs[e];


        // assumption: if this is pass-through rule, re-stem token in doc-language
        if (isPassThrough || (unknownWords != null && unknownWords.contains(fTerm))) {
          eTerm = stemmed2Stemmed.get(eTerm);
        }


        if (eTerm == null || docLangTokenizer.isStopWord(eTerm)) {
          //          LOG.info("Skipped trg token " + eTerm);
          eTerm = null;
          continue;      
        }
        bagOfTargetTokens.add(eTerm);
        if (isOne2Many <= 1) {
          if (probDist.containsKey(fTerm)) {
            HMapSFW eToken2Prob = probDist.get(fTerm);
            eToken2Prob.increment(eTerm, weight);
          }else {
            HMapSFW eToken2Prob = new HMapSFW();
            eToken2Prob.put(eTerm, weight);
            probDist.put(fTerm, eToken2Prob);
          }
        }
      }


      if (isOne2Many == 2) {
        // if ids.size() > 1 eTerm is a multi-token expression
        // even if eTerm is overwritten here, we need to do above loop to update bagOfTargetTokens
        if (ids.size() > 1) {
          eTerm = isConsecutiveWithStopwords(ids, rhs, docLangTokenizer);     // <---- heuristic
        }


        // no proper translation on target-side (e.g., stopword OR non-consecutive multi-word translation), let's skip
        if (eTerm == null) {

View Full Code Here

      String[] alPair = alignment.split("-");
      int f = Integer.parseInt(alPair[0]);
      int e = Integer.parseInt(alPair[1]);


      if(!one2manyAlign.containsKey(f)){
        one2manyAlign.put(f, new ArrayListOfInts());  
      }
      one2manyAlign.get(f).add(e);
    }


    // for each source token id, sort ids of its translations in ascending order
    for(Integer f : one2manyAlign.keySet()) {
      ArrayListOfInts lst = one2manyAlign.get(f);
      lst.sort();
      one2manyAlign.put(f, lst);
    }


    return one2manyAlign;
  }

View Full Code Here


      // Remember, token position is numbered started from one...
      if (positions.containsKey(term)) {
        positions.get(term).add(i + 1);
      } else {
        ArrayListOfInts l = new ArrayListOfInts();
        l.add(i + 1);
        positions.put(term, l);
      }
    }


    int doclength = 0;
    Iterator<Map.Entry<String, ArrayListOfInts>> it = positions.entrySet().iterator();
    Map.Entry<String, ArrayListOfInts> e;
    ArrayListOfInts positionsList;
    while (it.hasNext()) {
      e = it.next();
      positionsList = e.getValue();


      // We're storing tfs as shorts, so check for overflow...
      if (positionsList.size() >= TF_CUT) {
        // There are a few ways to handle this... If we're getting such a high tf, then it most
        // likely means that this is a junk doc.
        LOG.warn("Error: tf of " + e.getValue()
            + " will overflow max short value. docno=" + doc.getDocid() + ", term="
            + e.getKey());
        it.remove();
      } else {
        positionsList.trimToSize();
        doclength += positionsList.size();
      }
    }


    if ( positions.size() == 0 ) {
      return positions;
    }


    positions.put("", new ArrayListOfInts(new int[] { doclength }));
    return positions;
  }

View Full Code Here




  public static void addToTable(int curIndex, TreeSet<PairOfFloatString> topTrans, float cumProb, TTable_monolithic_IFAs table, 
      Vocab trgVocab, float cumProbThreshold, HookaStats stats) {
    List<Integer> sortedIndices = new ArrayList<Integer>();
    HMapIF index2ProbMap = new HMapIF();


    float sumOfProbs = 0.0f;    //only extract the top K<15 if the mass prob. exceeds MAX_probThreshold
    while(!topTrans.isEmpty() && sumOfProbs < cumProbThreshold){
      PairOfFloatString e = topTrans.pollLast();
      String term = e.getRightElement();
      float pr = e.getLeftElement()/cumProb;    // normalize
      logger.debug(term+"-->"+pr);
      int trgIndex = trgVocab.addOrGet(term);
      sumOfProbs += e.getLeftElement();         // keep track of unnormalized cumulative prob for determining cutoff
      sortedIndices.add(trgIndex);
      index2ProbMap.put(trgIndex, pr);
    }


    // to enable faster access with binary search, we sort entries by vocabulary index.
    Collections.sort(sortedIndices);
    int numEntries = sortedIndices.size();


    // for statistics only
    stats.update(numEntries, sumOfProbs);


    // write translation list to TTable object
    int[] indices = new int[numEntries];
    float[] probs = new float[numEntries];
    int i=0;
    for(int sortedIndex : sortedIndices){
      indices[i]=sortedIndex;
      probs[i]=index2ProbMap.get(sortedIndex);
      i++;
    }      
    table.set(curIndex, new IndexedFloatArray(indices, probs, true));
  }

View Full Code Here

    DocumentVectorSlidingWindow generator = new DocumentVectorSlidingWindow(env, fs);


    //Parse queries, judgemnts and features
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
    HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);
    HMapIF idfs = QueryUtility.loadIdf(env, parsedQueries);
    HMapIF cfs = QueryUtility.loadCf(env, parsedQueries);
    HMapIV<int[]> qrels = QrelUtility.parseQrelsFromTabDelimited(qrelPath);
    Map<String, Feature> featuresMap = FeatureUtility.parseFeatures(featurePath);
    Feature[] features = new Feature[featuresMap.size()];
    int index = 0;
    for(String key: featuresMap.keySet()) {

View Full Code Here

    DocumentVectorOnTheFlyIndexing generator = new DocumentVectorOnTheFlyIndexing(env, fs);


    //Parse queries, judgemnts and features
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
    HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);
    HMapIF idfs = QueryUtility.loadIdf(env, parsedQueries);
    HMapIF cfs = QueryUtility.loadCf(env, parsedQueries);
    HMapIV<int[]> qrels = QrelUtility.parseQrelsFromTabDelimited(qrelPath);
    Map<String, Feature> featuresMap = FeatureUtility.parseFeatures(featurePath);
    Feature[] features = new Feature[featuresMap.size()];
    int index = 0;
    for(String key: featuresMap.keySet()) {

View Full Code Here

    RankAndFeaturesSmallAdaptive generator = new RankAndFeaturesSmallAdaptive(env, fs);


    //Parse queries and find integer codes for the query terms.
    HMapIV<String> parsedQueries = QueryUtility.loadQueries(queryPath);
    HMapIV<int[]> queries = QueryUtility.queryToIntegerCode(env, parsedQueries);
    HMapIF idfs = QueryUtility.loadIdf(env, parsedQueries);
    HMapIF cfs = QueryUtility.loadCf(env, parsedQueries);
    HMapIV<int[]> qrels = QrelUtility.parseQrelsFromTabDelimited(qrelPath);
    Map<String, Feature> featuresMap = FeatureUtility.parseFeatures(featurePath);
    Feature[] features = new Feature[featuresMap.size()];
    int index = 0;
    for(String key: featuresMap.keySet()) {

View Full Code Here




  public static void addToTable(int curIndex, TreeSet<PairOfFloatString> topTrans, float cumProb, TTable_monolithic_IFAs table, Vocab trgVocab, 
      float cumProbThreshold, HookaStats stats) {
    List<Integer> sortedIndices = new ArrayList<Integer>();
    HMapIF index2ProbMap = new HMapIF();


    float sumOfProbs = 0.0f;    //only extract the top K<15 if the mass prob. exceeds MAX_probThreshold
    while(!topTrans.isEmpty() && sumOfProbs < cumProbThreshold){
      PairOfFloatString e = topTrans.pollLast();
      String term = e.getRightElement();
      float pr = e.getLeftElement()/cumProb;    // normalize
      logger.debug(term+"-->"+pr);
      int trgIndex = trgVocab.addOrGet(term);
      sumOfProbs += e.getLeftElement();         // keep track of unnormalized cumulative prob for determining cutoff
      sortedIndices.add(trgIndex);
      index2ProbMap.put(trgIndex, pr);
    }


    // to enable faster access with binary search, we sort entries by vocabulary index.
    Collections.sort(sortedIndices);
    int numEntries = sortedIndices.size();


    // for statistics only
    stats.update(numEntries, sumOfProbs);


    // write translation list to TTable object
    int[] indices = new int[numEntries];
    float[] probs = new float[numEntries];
    int i=0;
    for(int sortedIndex : sortedIndices){
      indices[i]=sortedIndex;
      probs[i]=index2ProbMap.get(sortedIndex);
      i++;
    }      
    table.set(curIndex, new IndexedFloatArray(indices, probs, true));
  }

View Full Code Here

                            (float) env.getDefaultDf(), (float) env.getDefaultCf());
  }


  private void preparePostings(String postingsPath) throws Exception {
    postings = new HMapIV<CompressedPositionalPostings>();
    dfs = new HMapII();
    docLengths = new HMapII();


    FSDataInputStream input = fs.open(new Path(postingsPath));
    int termid = input.readInt();
    while(termid != -1) {
      dfs.put(termid, input.readInt());

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.collection.wikipedia.WikipediaPage

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.