Package de.jetwick.tw

Source Code of de.jetwick.tw.TweetDetectorTest

/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*         http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package de.jetwick.tw;

import java.io.IOException;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.junit.Test;
import static org.junit.Assert.*;

/**
*
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class TweetDetectorTest {

    public TweetDetectorTest() {
    }

    @Test
    public void testRun() {
        List<JTweet> tweets = new ArrayList<JTweet>();
        tweets.add(createTweet(1, "term5 term2 term3 term4!"));
        tweets.add(createTweet(1, "term o"));
        tweets.add(createTweet(1, "TERM term6 Gehts?"));
        TweetDetector extractor = new TweetDetector(tweets);
        extractor.setTermMaxCount(10);
        List<Entry<String, Integer>> mostFrequentTerms = extractor.run().getSortedTerms();

        assertEquals(7, mostFrequentTerms.size());
        assertEquals("term", mostFrequentTerms.get(0).getKey());
        assertEquals(2, (int) mostFrequentTerms.get(0).getValue());
    }

    @Test
    public void testSkipUser() {
        List<JTweet> tweets = new ArrayList<JTweet>();
        tweets.add(createTweet(1L, "@userA term @userB term4!"));
        tweets.add(createTweet(2L, "@userA term o"));
        TweetDetector extractor = new TweetDetector(tweets);
        extractor.setTermMaxCount(10);
        List<Entry<String, Integer>> mostFrequentTerms = extractor.run().getSortedTerms();

        // ignore o which is too short
        // ignore @usera which starts with @
        assertEquals(2, mostFrequentTerms.size());
        assertEquals("term", mostFrequentTerms.get(0).getKey());
        assertEquals(2, (int) mostFrequentTerms.get(0).getValue());
    }

    @Test
    public void testUrlsInTerms() {
        List<JTweet> tweets = new ArrayList<JTweet>();
        tweets.add(createTweet(1, "the god http://www.jetwick.com/hihiho/test.html <b>http</b>://<b>bit</b>.ly/9FZv5E"));
        List<Entry<String, Integer>> mostFrequentTerms = createExtractor(tweets).run().getSortedTerms();
//        System.out.println(mostFrequentTerms);
        assertEquals(1, mostFrequentTerms.size());
    }

    @Test
    public void testTermsWithRemove() {
        List<JTweet> tweets = new ArrayList<JTweet>();
        tweets.add(createTweet(1, "the god"));
        tweets.add(createTweet(2, "the thing"));
        tweets.add(createTweet(3, "it's now"));
        List<Entry<String, Integer>> mostFrequentTerms = createExtractor(tweets).run().getSortedTerms();

        // remove "the", "it's" and "now"
        assertEquals(2, mostFrequentTerms.size());

        tweets.add(createTweet(4, "p d"));
        mostFrequentTerms = createExtractor(tweets).run().getSortedTerms();
        assertEquals(2, mostFrequentTerms.size());

        tweets.add(createTweet(5, "we're --"));
        mostFrequentTerms = createExtractor(tweets).run().getSortedTerms();
        assertEquals(2, mostFrequentTerms.size());

        tweets.clear();
        tweets.add(createTweet(6, "c++"));
        tweets.add(createTweet(7, "c#"));
        tweets.add(createTweet(8, "DivaDOD:"));

        mostFrequentTerms = createExtractor(tweets).run().getSortedTerms();
        assertEquals(3, mostFrequentTerms.size());
        assertEquals("c++", mostFrequentTerms.get(0).getKey());
        assertEquals("c#", mostFrequentTerms.get(1).getKey());
        assertEquals("divadod", mostFrequentTerms.get(2).getKey());
    }

    @Test
    public void testStripNoiseFromWords() {
        assertEquals("@hi@", TweetDetector.stripNoiseFromWord("@hi@"));
        assertEquals("pet ", TweetDetector.stripNoiseFromWord("pet."));
        assertEquals("@peter ", TweetDetector.stripNoiseFromWord("@peter."));
        assertEquals("@pet er ", TweetDetector.stripNoiseFromWord("@pet,er!"));
        assertEquals("@pet er ", TweetDetector.stripNoiseFromWord("@pet,er!"));
        assertEquals("@peter_mueller", TweetDetector.stripNoiseFromWord("@<b>peter</b>_mueller"));
        assertEquals("  peter  ", TweetDetector.stripNoiseFromWord(">>peter<<"));
        assertEquals(" peter ", TweetDetector.stripNoiseFromWord("\"peter\""));
        assertEquals("don't", TweetDetector.stripNoiseFromWord("don't"));
        assertEquals("hi how are you ", TweetDetector.stripNoiseFromWord("hi\nhow\tare you?"));

        assertEquals("all things after urls should remain!",
                "  hi", TweetDetector.stripNoiseFromWord("http://blibla.de hi"));
        assertEquals("test_t   test", TweetDetector.stripNoiseFromWord("test_t https://www.stupid.de test"));
        assertEquals(" ", TweetDetector.stripNoiseFromWord("http://blibla.de"));
        assertEquals(" ", TweetDetector.stripNoiseFromWord(" "));
        assertEquals("  test", TweetDetector.stripNoiseFromWord("http:// test"));
        assertEquals("http test", TweetDetector.stripNoiseFromWord("http&test"));
        assertEquals("http test ", TweetDetector.stripNoiseFromWord("http;test;"));
    }
   
    @Test
    public void testLanguageDetection() {
        // skip the noise words and last terms for language detection:
        List<JTweet> tweets = new ArrayList<JTweet>();
        tweets.add(createTweet(1, "das geht ja ab!"));
        Map<String, Integer> langs = createExtractor(tweets).run().getLanguages();
        assertEquals(3, langs.get(TweetDetector.DE).intValue());
    }

    @Test
    public void testTerms() {
        JUser user = new JUser("Peter");
        user.addOwnTweet(new JTweet(1, "test pest alpha", user));
        user.addOwnTweet(new JTweet(2, "alpha", user));

        assertEquals(3, (int) createExtractor(user.getOwnTweets()).run().getSortedTerms().size());
        assertEquals(2, (int) createExtractor(user.getOwnTweets()).run().getSortedTerms().get(0).getValue());
    }

    @Test
    public void testChinAnalyzing() throws IOException {
         JUser user = new JUser("Peter");
        user.addOwnTweet(new JTweet(1, "飼い主さん!!ペットを迎えに行ってください!!", user));
        user.addOwnTweet(new JTweet(2, "希望を捨てないでくださいペット", user));
       
//        assertEquals(1, (int) createExtractor(user.getOwnTweets()).run().getSortedTerms().size());
//        assertEquals(2, (int) createExtractor(user.getOwnTweets()).run().getSortedTerms().get(0).getValue());
    }
   
    TweetDetector createExtractor(Collection<JTweet> tweets) {
        return new TweetDetector(tweets);
    }

    JTweet createTweet(long id, String twText) {
        return new JTweet(id, twText, new JUser("tmp")).setCreatedAt(new Date(id));
    }
}
TOP

Related Classes of de.jetwick.tw.TweetDetectorTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.