Examples of StopWords


Examples of ch.akuhn.hapax.corpus.Stopwords

@RunWith(JExample.class)
public class StopwordsTest {
 
  @Test
  public void stopWords() {
    Stopwords basicEnglish = Stopwords.BASIC_ENGLISH;
    assertTrue(basicEnglish.contains("a"));
  }
View Full Code Here

Examples of com.openkm.kea.stopwords.Stopwords

      } catch (Exception e) {
        log.error("Error creating class instance", e);
      }
    }

    Stopwords stopwords = null;
    if (stopwordsClassName != null) {
      try {
        @SuppressWarnings("rawtypes")
        Class clazz = Class.forName(stopwordsClassName);
        stopwords = (Stopwords) clazz.newInstance();
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

public class KeyWordExtraction {
 
  public static void main(String[] args) throws Exception {
   
   
    StopWords sw= new StopWords("../models/stopwords");
    CWSTagger seg = new CWSTagger("../models/seg.m");
    AbstractExtractor key = new WordExtract(seg,sw);
   
    System.out.println(key.extract("甬温线特别重大铁路交通事故车辆经过近24小时的清理工作,26日深夜已经全部移出事故现场,之前埋下的D301次动车车头被挖出运走", 20, true));
   
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    BufferedReader in = new BufferedReader(new InputStreamReader(
        new FileInputStream(infile ), "utf8"));

    //    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
//        outfile), enc2));
    StopWords sw = new StopWords(stopwordfile);
   
    LabelAlphabet dict = new LabelAlphabet();
    // words in documents
    ArrayList<TIntArrayList> documentsList= new ArrayList<TIntArrayList>();
   
   
    String line = null;
    while ((line = in.readLine()) != null) {
      line = line.trim()
      if(line.length()==0)
        continue;
      String[] toks = line.split("\\s+");
      TIntArrayList wordlist = new TIntArrayList();
      for(int j=0;j<toks.length;j++){
        String tok = toks[j];
        if(sw.isStopWord(tok))
          continue;
        int idx = dict.lookupIndex(tok);
        wordlist.add(idx);
      }
      documentsList.add(wordlist);
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    dN = 0.85;
  }
 
  public WordExtract(String segPath, String dicPath) throws Exception{
    tag = new CWSTagger(segPath);
    test = new StopWords(dicPath);
  }
View Full Code Here

Examples of org.fnlp.nlp.corpus.StopWords

    test = new StopWords(dicPath);
  }
 
  public WordExtract(CWSTagger tag, String dicPath){
    this.tag = tag;
    test = new StopWords(dicPath);
  }
View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here

Examples of weka.core.Stopwords

    } else {
      m_inputVector.clear();
    }
   
    if (m_useStopList && m_stopwords == null) {
      m_stopwords = new Stopwords();
      try {
        if (getStopwords().exists() && !getStopwords().isDirectory()) {
          m_stopwords.read(getStopwords());
        }
      } catch (Exception ex) {
View Full Code Here

Examples of weka.core.Stopwords

  /**
   * determines the dictionary.
   */
  private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
      try {
  if (getStopwords().exists() && !getStopwords().isDirectory())
    stopwords.read(getStopwords());
      }
      catch (Exception e) {
  e.printStackTrace();
      }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap [] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate =
      Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      int vInd = 0;
      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
  vInd = (int)instance.classValue();
      }

      // Iterate through all relevant string attributes of the current instance
      Hashtable h = new Hashtable();
      for (int j = 0; j < instance.numAttributes(); j++) {
  if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

    // Get tokenizer
    m_Tokenizer.tokenize(instance.stringValue(j));

    // Iterate through tokens, perform stemming, and remove stopwords
    // (if required)
    while (m_Tokenizer.hasMoreElements()) {
      String word = ((String)m_Tokenizer.nextElement()).intern();

      if(this.m_lowerCaseTokens==true)
        word = word.toLowerCase();

      word = m_Stemmer.stem(word);

      if(this.m_useStoplist==true)
        if(stopwords.is(word))
    continue;

      if(!(h.contains(word)))
        h.put(word, new Integer(0));

View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.