Examples of NGramModel

opennlp.tools.ngram.NGramModel
The {@link NGramModel} can be used to create ngrams and character ngrams. @see StringList

Examples of opennlp.tools.ngram.NGramModel

    this(2, 5);
  }


  public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {


    NGramModel model = new NGramModel();
    model.add(tokens[index], minLength, maxLength);


    for (Iterator<StringList> it = model.iterator(); it.hasNext();) {


      StringList tokenList = it.next();


      if (tokenList.size() > 0) {
        features.add("ng=" + tokenList.getToken(0).toLowerCase());

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

    if (cutoffString != null) {
      // TODO: Maybe throw illegal argument exception if not parse able
      cutoff = Integer.parseInt(cutoffString);
    }
    
    NGramModel mdict = new NGramModel();
    Parse p;
    while((p = data.read()) != null) {
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi=0;wi<words.length;wi++) {
        words[wi] = pwords[wi].toString();
      }


      mdict.add(new StringList(words), 1, 1);
      //add tri-grams and bi-grams for inital sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi=0;wi<cwords.length;wi++) {
        cwords[wi] = chunks[wi].getHead().toString();
      }
      mdict.add(new StringList(cwords), 2, 3);


      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().toString()+" chunks.length="+chunks.length);
        if (lastChild(chunks[ci], chunks[ci].getParent(),rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >=0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks,ci,chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            String[] window = new String[5];
            int wi = 0;
            if (ci-2 >= 0) window[wi++] = chunks[ci-2].getHead().toString();
            if (ci-1 >= 0) window[wi++] = chunks[ci-1].getHead().toString();
            window[wi++] = chunks[ci].getHead().toString();
            if (ci+1 < chunks.length) window[wi++] = chunks[ci+1].getHead().toString();
            if (ci+2 < chunks.length) window[wi++] = chunks[ci+2].getHead().toString();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi=0;swi<wi;swi++) {
                subWindow[swi]=window[swi];
              }
              window = subWindow;
            }
            if (window.length >=3) {
              mdict.add(new StringList(window), 2, 3);
            }
            else if (window.length == 2) {
              mdict.add(new StringList(window), 2, 2);
            }
          }
          ci=reduceStart-1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    //System.err.println("gas,and="+mdict.getCount((new TokenList(new String[] {"gas","and"}))));
    mdict.cutoff(cutoff, Integer.MAX_VALUE);
    return mdict.toDictionary(true);
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

  }
  
  public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
      throws IOException {
    
    NGramModel ngramModel = new NGramModel();
    
    POSSample sample;
    while((sample = samples.read()) != null) {
      String[] words = sample.getSentence();
      
      if (words.length > 0)
        ngramModel.add(new StringList(words), 1, 1);
    }
    
    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
    
    return ngramModel.toDictionary(true);
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel


  private static void buildDictionary(String dict, File inFile, int cutoff)
      throws FileNotFoundException, IOException {
    System.err.println("Building dictionary");


    NGramModel ngramModel = new NGramModel();


    DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
    while(data.hasNext()) {
      String tagStr = (String) data.nextToken();
      String[] tt = tagStr.split(" ");
      String[] words = new String[tt.length];
      for (int wi=0;wi<words.length;wi++) {
        words[wi] =
            tt[wi].substring(0,tt[wi].lastIndexOf('_'));
      }


      ngramModel.add(new StringList(words), 1, 1);
    }


    System.out.println("Saving the dictionary");


    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
    Dictionary dictionary = ngramModel.toDictionary(true);


    dictionary.serialize(new FileOutputStream(dict));
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

    if (cutoffString != null) {
      // TODO: Maybe throw illegal argument exception if not parse able
      cutoff = Integer.parseInt(cutoffString);
    }
    
    NGramModel mdict = new NGramModel();
    Parse p;
    while((p = data.read()) != null) {
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi=0;wi<words.length;wi++) {
        words[wi] = pwords[wi].getCoveredText();
      }


      mdict.add(new StringList(words), 1, 1);
      //add tri-grams and bi-grams for inital sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi=0;wi<cwords.length;wi++) {
        cwords[wi] = chunks[wi].getHead().getCoveredText();
      }
      mdict.add(new StringList(cwords), 2, 3);


      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().getCoveredText()+" chunks.length="+chunks.length);
        if (lastChild(chunks[ci], chunks[ci].getParent(),rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >=0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks,ci,chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            String[] window = new String[5];
            int wi = 0;
            if (ci-2 >= 0) window[wi++] = chunks[ci-2].getHead().getCoveredText();
            if (ci-1 >= 0) window[wi++] = chunks[ci-1].getHead().getCoveredText();
            window[wi++] = chunks[ci].getHead().getCoveredText();
            if (ci+1 < chunks.length) window[wi++] = chunks[ci+1].getHead().getCoveredText();
            if (ci+2 < chunks.length) window[wi++] = chunks[ci+2].getHead().getCoveredText();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi=0;swi<wi;swi++) {
                subWindow[swi]=window[swi];
              }
              window = subWindow;
            }
            if (window.length >=3) {
              mdict.add(new StringList(window), 2, 3);
            }
            else if (window.length == 2) {
              mdict.add(new StringList(window), 2, 2);
            }
          }
          ci=reduceStart-1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    //System.err.println("gas,and="+mdict.getCount((new TokenList(new String[] {"gas","and"}))));
    mdict.cutoff(cutoff, Integer.MAX_VALUE);
    return mdict.toDictionary(true);
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

  }
  
  public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
      throws IOException {
    
    NGramModel ngramModel = new NGramModel();
    
    POSSample sample;
    while((sample = samples.read()) != null) {
      String[] words = sample.getSentence();
      
      if (words.length > 0)
        ngramModel.add(new StringList(words), 1, 1);
    }
    
    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
    
    return ngramModel.toDictionary(true);
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel


  private static void buildDictionary(String dict, File inFile, int cutoff)
      throws FileNotFoundException, IOException {
    System.err.println("Building dictionary");


    NGramModel ngramModel = new NGramModel();


    DataStream data = new opennlp.maxent.PlainTextByLineDataStream(new java.io.FileReader(inFile));
    while(data.hasNext()) {
      String tagStr = (String) data.nextToken();
      String[] tt = tagStr.split(" ");
      String[] words = new String[tt.length];
      for (int wi=0;wi<words.length;wi++) {
        words[wi] =
            tt[wi].substring(0,tt[wi].lastIndexOf('_'));
      }


      ngramModel.add(new StringList(words), 1, 1);
    }


    System.out.println("Saving the dictionary");


    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
    Dictionary dictionary = ngramModel.toDictionary(true);


    dictionary.serialize(new FileOutputStream(dict));
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

    this(2, 5);
  }


  public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {


    NGramModel model = new NGramModel();
    model.add(tokens[index], minLength, maxLength);


    for (StringList tokenList : model) {


      if (tokenList.size() > 0) {
        features.add("ng=" + tokenList.getToken(0).toLowerCase());

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

    if (cutoffString != null) {
      // TODO: Maybe throw illegal argument exception if not parse able
      cutoff = Integer.parseInt(cutoffString);
    }


    NGramModel mdict = new NGramModel();
    Parse p;
    while((p = data.read()) != null) {
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi=0;wi<words.length;wi++) {
        words[wi] = pwords[wi].getCoveredText();
      }


      mdict.add(new StringList(words), 1, 1);
      //add tri-grams and bi-grams for inital sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi=0;wi<cwords.length;wi++) {
        cwords[wi] = chunks[wi].getHead().getCoveredText();
      }
      mdict.add(new StringList(cwords), 2, 3);


      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        //System.err.println("chunks["+ci+"]="+chunks[ci].getHead().getCoveredText()+" chunks.length="+chunks.length + "  " + chunks[ci].getParent());


        if (chunks[ci].getParent() == null) {
          chunks[ci].show();
        }
        if (lastChild(chunks[ci], chunks[ci].getParent(),rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >=0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks,ci,chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            String[] window = new String[5];
            int wi = 0;
            if (ci-2 >= 0) window[wi++] = chunks[ci-2].getHead().getCoveredText();
            if (ci-1 >= 0) window[wi++] = chunks[ci-1].getHead().getCoveredText();
            window[wi++] = chunks[ci].getHead().getCoveredText();
            if (ci+1 < chunks.length) window[wi++] = chunks[ci+1].getHead().getCoveredText();
            if (ci+2 < chunks.length) window[wi++] = chunks[ci+2].getHead().getCoveredText();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              for (int swi=0;swi<wi;swi++) {
                subWindow[swi]=window[swi];
              }
              window = subWindow;
            }
            if (window.length >=3) {
              mdict.add(new StringList(window), 2, 3);
            }
            else if (window.length == 2) {
              mdict.add(new StringList(window), 2, 2);
            }
          }
          ci=reduceStart-1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    //System.err.println("gas,and="+mdict.getCount((new TokenList(new String[] {"gas","and"}))));
    mdict.cutoff(cutoff, Integer.MAX_VALUE);
    return mdict.toDictionary(true);
  }

View Full Code Here

Examples of opennlp.tools.ngram.NGramModel

  }


  public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
      throws IOException {


    NGramModel ngramModel = new NGramModel();


    POSSample sample;
    while((sample = samples.read()) != null) {
      String[] words = sample.getSentence();


      if (words.length > 0)
        ngramModel.add(new StringList(words), 1, 1);
    }


    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);


    return ngramModel.toDictionary(true);
  }

View Full Code Here

0 1

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.