// Package de.jetwick.tw.cmd
//
// Source code of de.jetwick.tw.cmd.TermCreateCommandTest

/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*         http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.tw.cmd;

import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.tw.FakeUrlExtractor;
import de.jetwick.tw.TweetDetector;
import de.jetwick.util.MyDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;
import org.junit.Test;

import static org.junit.Assert.*;

/**
*
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class TermCreateCommandTest {

    public TermCreateCommandTest() {
    }

    static void execute(Collection<JTweet> tweets) {
        execute(tweets, true);
    }

    static void execute(Collection<JTweet> tweets, boolean termRemoving) {
        // remove executor since we only have one remaining command?
        for (JTweet tw : tweets) {
            for (UrlEntry entry : new FakeUrlExtractor().setText(tw.getText()).run().getUrlEntries()) {
                tw.addUrlEntry(entry);
            }
        }
        new SerialCommandExecutor(tweets).add(new TermCreateCommand(termRemoving)).execute();
    }

    static void execute(JTweet tw) {
        execute(Arrays.asList(tw), false);
    }

    @Test
    public void testQuality() {
        JTweet tw1 = createSolrTweet(1L, "@lwr32 #JAVA! "
                + "#COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA!", "usera");
        JTweet tw2 = createSolrTweet(2L, "@meggytron JAH-VA! java java java java "
                + "java java java. /Dante's Peak #requirescaffeine mashup", "userb");
        JTweet tw3 = createSolrTweet(3L, "@ierinleker ...JAVA JAVA JAVA JAVA JAVA "
                + "JAVA JAVA http://twitpic.com/2kk65u", "userc");
        JTweet tw4 = createSolrTweet(4L, "java", "userd");

        execute(Arrays.asList(tw1, tw2, tw3, tw4));

        assertTrue(tw4.getQuality() > tw3.getQuality());
        // both tweets have 7 java terms
        assertEquals(tw3.getQuality(), tw2.getQuality());
        assertTrue(tw2.getQuality() > tw1.getQuality());
    }

    @Test
    public void testQuality2() {
        String[] tweetsAsStr = new String[]{
            "Fernsehen entut Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen taek Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen stream Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen live Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Televisie kijken Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen kijken Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Televisie Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html#1live",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html#1"
        };
        List<JTweet> list = new ArrayList();
        int counter = 0;
        JUser user = new JUser("sakilamahipallb");
        for (String tw : tweetsAsStr) {
            counter++;
            list.add(new JTweet(counter, tw, user).setCreatedAt(new Date(counter)));
        }
        execute(list);

        counter = 0;
        int spamCounter = 0;
        for (JTweet tw : list) {
            if (counter++ > 0) {
                assertTrue("tweet:" + tw, tw.getQuality() < JTweet.QUAL_LOW);

                if (tw.getQuality() < JTweet.QUAL_SPAM)
                    spamCounter++;
            }
        }
        // a lot of those tweets are spam - not only bad!
        assertTrue(spamCounter > 5);

        user = new JUser("user2");
        JTweet tw1 = new JTweet(1L, "E Grant Rd / N Swan Rd Accident no injury (Tue 3:24 PM)  http://tinyurl.com/5hwubc", user).setCreatedAt(new Date(1));
        JTweet tw2 = new JTweet(2L, "N Columbus Bl / E Grant Rd Accident no injury (Tue 3:26 PM)  http://tinyurl.com/658t96", user).setCreatedAt(new Date(2));
        execute(Arrays.asList(tw1, tw2));
//        assertTrue("tweet:" + tw1, tw1.getQuality() < SolrTweet.QUAL_MAX);
        assertTrue("tweet:" + tw1, tw1.getQuality() > JTweet.QUAL_SPAM);
        assertTrue("tweet:" + tw2, tw2.getQuality() < JTweet.QUAL_MAX);
        assertTrue("tweet:" + tw2, tw2.getQuality() > JTweet.QUAL_SPAM);

        user = new JUser("user2");
        tw1 = new JTweet(1L, "Werder Bremen verliert sein Heimspiel gegen Twente http://goo.gl/fb/fKFEi #werder #svw", user).setCreatedAt(new Date(1));
        tw2 = new JTweet(2L, "Werder Bremen verliert gegen Twente Enschede http://goo.gl/fb/O8maL #werder #svw", user).setCreatedAt(new Date(2));
        execute(Arrays.asList(tw1, tw2));
        assertTrue("tweet:" + tw1, tw1.getQuality() == JTweet.QUAL_MAX);
        assertTrue("tweet:" + tw2, tw2.getQuality() < JTweet.QUAL_MAX);
        assertTrue("tweet:" + tw2, tw2.getQuality() > JTweet.QUAL_SPAM);
    }

    @Test
    public void testDecreaseQualityOnlyOnce() {
        String url1, url2, url3;
        url1 = url2 = url3 = "http://watchlivefree.blogspot.com";
        String[] tweetsAsStr = new String[]{
            "blap notspamword " + url1,
            "blup secondnotspamword " + url2,
            "bli secondsomething" + url3};

        JUser user = new JUser("user1");
        JTweet tw1 = new JTweet(1L, tweetsAsStr[0], user).setCreatedAt(new Date(1L));
        tw1.getUrlEntries().add(new UrlEntry(5, 123, url1).setResolvedTitle("title1"));
        JTweet tw2 = new JTweet(2L, tweetsAsStr[1], user).setCreatedAt(new Date(2L));
        tw2.getUrlEntries().add(new UrlEntry(5, 123, url2).setResolvedTitle("title2"));
        JTweet tw3 = new JTweet(3L, tweetsAsStr[2], user).setCreatedAt(new Date(3L));
        tw3.getUrlEntries().add(new UrlEntry(5, 123, url3).setResolvedTitle("title3"));

        execute(Arrays.asList(tw1, tw2, tw3));

        assertEquals(JTweet.QUAL_MAX, tw1.getQuality());
        assertTrue(tw2.getQuality() > JTweet.QUAL_SPAM);
        assertTrue(tw3.getQuality() > JTweet.QUAL_SPAM);
    }

//    @Test
//    public void testUrlTitleQuality() {
//        String url1 = "http://watchlivefree.blogspot.different.domain.com",
//                url2 = "http://watchlivefree.blogspot.com";
//        String[] tweetsAsStr = new String[]{
//            "blap notspamword " + url1,
//            "blup secondnotspamword " + url2};
//
//        JUser user = new JUser("user1");
//        JTweet tw1 = new JTweet(1L, tweetsAsStr[0], user).setCreatedAt(new Date(1L));
//        tw1.getUrlEntries().add(new UrlEntry(5, 123, url1).setResolvedTitle("identical title"));
//        JTweet tw2 = new JTweet(2L, tweetsAsStr[1], user).setCreatedAt(new Date(2L));
//        tw2.getUrlEntries().add(new UrlEntry(5, 123, url2).setResolvedTitle("identical title"));
//
//        execute(Arrays.asList(tw1, tw2));
//
//        assertTrue("tweet:" + tw1, tw1.getQuality() > 90);
//        assertTrue("tweet:" + tw2, tw2.getQuality() < 90);
//    }

    @Test
    public void testExecute() {
        JTweet tw = new JTweet(1L, "java lava", new JUser("tmp")).setCreatedAt(new Date(1L));
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());

        JUser u = new JUser("peter");
        tw = new JTweet(1L, "java lava", u);
        JTweet tw2 = new JTweet(2L, "peter java", u).setCreatedAt(new Date(2L));
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());
        assertEquals(2, tw2.getTextTerms().size());
    }

    JTweet createSolrTweet(long id, String twText, String user) {
        return new JTweet(id, twText, new JUser(user)).setCreatedAt(new MyDate(id).toDate());
    }

    @Test
    public void testTermDetection() {
        JUser user = new JUser("Peter");
        user.addOwnTweet(new JTweet(1, "term1 term2 term1", user));
        JTweet tw = new JTweet(2, "term3 not term2 important term3", user);
        user.addOwnTweet(tw);

        execute(tw);
        Collection<Entry<String, Integer>> coll = tw.getTextTerms().entrySet();
        assertEquals(3, (int) coll.size());
        int counter = 0;
        int counter2 = 0;
        for (Entry<String, Integer> e : coll) {
            if (e.getKey().equals("term1"))
                counter++;

            if (e.getKey().equals("not"))
                counter2++;
        }
        assertEquals(0, counter);
        assertEquals(0, counter2);
    }

    @Test
    public void testTermDetection2() {
        JUser user = new JUser("Peter");
        JTweet tw1 = new JTweet(1, "#term1 #term1", user);
        user.addOwnTweet(tw1);
        JTweet tw2 = new JTweet(2, "term1", user);
        user.addOwnTweet(tw2);

        execute(tw2);

        // two tweets with 'term1'
//        assertEquals(2, (int) extractor.run().getSortedTerms().get(0).getValue());
        assertEquals(1, (int) tw2.getTextTerms().size());
    }

    @Test
    public void testTermDetection3() {
        JTweet tw = new JTweet(1L, "A Year Without Rain "
                + "A Year Without Rain A Year Without Rain A Year Without Rain "
                + "A Year Without Rain A Year Without Rain A Year Without Rain", new JUser("peter"));
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());
    }

    @Test
    public void testOtherTweets() {
        JUser u = new JUser("peter");
        JTweet tw1 = new JTweet(1L, "A Year Without Rain Will Give Us desert xyz", u).setCreatedAt(new Date(2L));
        // tw2 is older than tw1
        JTweet tw2 = new JTweet(2L, "A Year Without Rain Will Give Us really fat desert", u).setCreatedAt(new Date(1L));
        JTweet tw3 = new JTweet(3L, "great hui desert", u).setCreatedAt(new Date(0L));
        tw1.setQuality(100);
        tw2.setQuality(89);
        execute(tw1);
        // unchanged
        assertEquals(89, tw2.getQuality());
        assertTrue(tw1.getQuality() < 100);

        tw1.setQuality(100);
        StringFreqMap tFreq = new StringFreqMap();
        StringFreqMap lFreq = new StringFreqMap();
        new TermCreateCommand().checkSpamInExistingTweets(tw1, tFreq, lFreq);
        // without tw1
        assertEquals(9, (int) lFreq.get(TweetDetector.EN));
        assertEquals(1, (int) lFreq.get(TweetDetector.DE));

        assertEquals(6, (int) tw1.getLanguages().get(TweetDetector.EN));

        // without tw1
        assertEquals(2, (int) tFreq.get("desert"));
        assertEquals(1, (int) tFreq.get("hui"));
        assertNull(tFreq.get("xyz"));

        assertEquals(1, (int) tw2.getTextTerms().get("fat"));
    }

    @Test
    public void testLanguageDetection2() {
        JUser user = new JUser("peter");
        JTweet tw1 = new JTweet(0, "this is lastwordIsNotRecognizedBecauseItCouldBeStrippedOut", user);
        execute(tw1);
        assertEquals(2, tw1.getLanguages().get(TweetDetector.EN).intValue());
        assertEquals(TweetDetector.UNKNOWN_LANG, tw1.getLanguage());

        // now the language is detected because a lot noise NOISE_WORDS were found
        JTweet tw = new JTweet(2, "viele ist dort deutscher Tweet!", user);
        execute(tw);
        assertEquals(TweetDetector.DE, tw.getLanguage());

        user = new JUser("peter");
        tw = new JTweet(3L, "Togos with @munckytown on lunch break. "
                + "Hall and Oates \"kiss on my list\" is playing... groovy", user);
        execute(tw);
        assertEquals(TweetDetector.EN, tw.getLanguage());

        user = new JUser("peter");
        tw = new JTweet(4L, "@ibood Bedankt voor de code! :-)", user);
        execute(tw);
        // only de and en are known so detect as unknown!
        assertEquals(TweetDetector.UNKNOWN_LANG, tw.getLanguage());

        // now detect the nl language
        tw = new JTweet(5L, "@MrDeek Klinkt goed toch, een bestek set is altijd leuk om te krijgen of te geven!", user);
        execute(tw);
        assertEquals(TweetDetector.NL, tw.getLanguage());
    }

    @Test
    public void testLanguageDetection3() {
        JTweet tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc("de", 1);
        StringFreqMap otherLanguages = new StringFreqMap();
        assertEquals(TweetDetector.UNKNOWN_LANG, new TermCreateCommand().detectLanguage(tw, otherLanguages));

        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc("de", 2);
        otherLanguages = new StringFreqMap().set("de", 1);
        assertEquals("de", new TermCreateCommand().detectLanguage(tw, otherLanguages));

        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc(TweetDetector.UNKNOWN_LANG, 2);
        tw.getLanguages().inc("de", 2);
        otherLanguages = new StringFreqMap().set("de", 1);
        assertEquals("de", new TermCreateCommand().detectLanguage(tw, otherLanguages));

        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc(TweetDetector.UNKNOWN_LANG, 2);
        tw.getLanguages().inc("de", 2);
        tw.getLanguages().inc("en", 2);
        otherLanguages = new StringFreqMap().set("de", 1).set("en", 1);
        assertEquals(TweetDetector.UNKNOWN_LANG, new TermCreateCommand().detectLanguage(tw, otherLanguages));
    }

//    @Test
//    public void testSignature() {
//        SolrTweet tw = new SolrTweet(1L, "wtf wtf text", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw);
//        assertTrue(tw.getTextSignature().size() > 0);
//        SolrTweet tw2 = new SolrTweet(2L, "wtf wtf text", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw2);
//        assertEquals(tw.getTextSignature(), tw2.getTextSignature());
//
//        SolrTweet tw3 = new SolrTweet(3L, "wtf wtf text wikileaks info", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw3);
//        int counter = 0;
//        for (Long val : tw3.getTextSignature()) {
//            if (tw2.getTextSignature().contains(val))
//                counter++;
//        }
//        assertTrue("At least on signature should be identical for tweet2 and tweet3", counter > 0);
//    }   
}
// TOP
//
// Related classes of de.jetwick.tw.cmd.TermCreateCommandTest
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.