Examples of StopWords


Examples of ch.akuhn.hapax.corpus.Stopwords

@RunWith(JExample.class)
public class StopwordsTest {
 
  @Test
  public void stopWords() {
    Stopwords basicEnglish = Stopwords.BASIC_ENGLISH;
    assertTrue(basicEnglish.contains("a"));
  }
View Full Code Here

Examples of com.openkm.kea.stopwords.Stopwords

      } catch (Exception e) {
        log.error("Error creating class instance", e);
      }
    }

    Stopwords stopwords = null;
    if (stopwordsClassName != null) {
      try {
        @SuppressWarnings("rawtypes")
        Class clazz = Class.forName(stopwordsClassName);
        stopwords = (Stopwords) clazz.newInstance();
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

public class KeyWordExtraction {
 
  public static void main(String[] args) throws Exception {
   
   
    StopWords sw= new StopWords("../models/stopwords");
    CWSTagger seg = new CWSTagger("../models/seg.m");
    AbstractExtractor key = new WordExtract(seg,sw);
   
    System.out.println(key.extract("甬温线特别重大铁路交通事故车辆经过近24小时的清理工作,26日深夜已经全部移出事故现场,之前埋下的D301次动车车头被挖出运走", 20, true));
   
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    BufferedReader in = new BufferedReader(new InputStreamReader(
        new FileInputStream(infile ), "utf8"));

    //    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
//        outfile), enc2));
    StopWords sw = new StopWords(stopwordfile);
   
    LabelAlphabet dict = new LabelAlphabet();
    // words in documents
    ArrayList<TIntArrayList> documentsList= new ArrayList<TIntArrayList>();
   
   
    String line = null;
    while ((line = in.readLine()) != null) {
      line = line.trim()
      if(line.length()==0)
        continue;
      String[] toks = line.split("\\s+");
      TIntArrayList wordlist = new TIntArrayList();
      for(int j=0;j<toks.length;j++){
        String tok = toks[j];
        if(sw.isStopWord(tok))
          continue;
        int idx = dict.lookupIndex(tok);
        wordlist.add(idx);
      }
      documentsList.add(wordlist);
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    dN = 0.85;
  }
 
  public WordExtract(String segPath, String dicPath) throws Exception{
    tag = new CWSTagger(segPath);
    test = new StopWords(dicPath);
  }
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    test = new StopWords(dicPath);
  }
 
  public WordExtract(CWSTagger tag, String dicPath){
    this.tag = tag;
    test = new StopWords(dicPath);
  }
View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

Examples of weka.core.Stopwords

    } else {
      m_inputVector.clear();
    }
   
    if (m_useStopList && m_stopwords == null) {
      m_stopwords = new Stopwords();
      try {
        if (getStopwords().exists() && !getStopwords().isDirectory()) {
          m_stopwords.read(getStopwords());
        }
      } catch (Exception ex) {
View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.