Package cc.mallet.pipe

Examples of cc.mallet.pipe.TokenSequenceRemoveStopwords


    // Pipes: lowercase, tokenize, remove stopwords, map to features
    pipeList.add( new CharSequenceLowercase() );
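    // Tokenize with a regular expression: each token starts and ends with a letter, with one or more letters or punctuation characters in between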
    pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
    pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplist/en.txt"), "UTF-8", false, false, false) );
    // Optionally add bigram tokens (currently disabled)
    //pipeList.add(new TokenSequenceNGrams(new int[] {2} ));
       
    // Convert the token sequence to a feature sequence
    pipeList.add( new TokenSequence2FeatureSequence() );
View Full Code Here


        Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S");
        int[] sizes = {1,2};
        ArrayList pipeList = new ArrayList();

        pipeList.add(new CharSequence2TokenSequence(tokenPattern));
        pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list
        pipeList.add(new TokenSequenceNGramsDelim(sizes, " "));
        pipeList.add(new TokenSequence2FeatureSequence());
        return new SerialPipes(pipeList);
    }
View Full Code Here

        // Normalize all tokens to all lowercase
        pipeList.add(new TokenSequenceLowercase());

        // Remove stopwords from a standard English stoplist.
        //  options: [case sensitive] [mark deletions]
        pipeList.add(new TokenSequenceRemoveStopwords(false, false));

        // Rather than storing tokens as strings, convert
        //  them to integers by looking them up in an alphabet.
        pipeList.add(new TokenSequence2FeatureSequence());
View Full Code Here

        // Normalize all tokens to all lowercase
        pipeList.add(new TokenSequenceLowercase());

        // Remove stopwords from a standard English stoplist.
        //  options: [case sensitive] [mark deletions]
        pipeList.add(new TokenSequenceRemoveStopwords(false, false));

        // Rather than storing tokens as strings, convert
        //  them to integers by looking them up in an alphabet.
        pipeList.add(new TokenSequence2FeatureSequence());
View Full Code Here

TOP

Related Classes of cc.mallet.pipe.TokenSequenceRemoveStopwords

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.