Package de.jetwick.data

Source Code of de.jetwick.data.JTweet

/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*         http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.data;

import de.jetwick.tw.TweetDetector;
import de.jetwick.tw.Twitter4JTweet;
import de.jetwick.tw.cmd.StringFreqMap;
import de.jetwick.util.Helper;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import twitter4j.Tweet;

/**
*
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class JTweet implements ElasticObject<JTweet>, Serializable {

    private static final long serialVersionUID = 1L;
    // Not referenced anywhere else inside this class.
    public static final int MAX_LENGTH = 800;
    // Comparator passed to Collections.sort in sortAndDeduplicate.
    public static final Comparator tweetIdComparator = new TwitterIdComparator();
    // Quality every tweet starts with (assigned in the base constructor).
    public static final int QUAL_MAX = 100;
    //
    // detect three other *similar* tweets THEN BAD
    // (LOW/100)^2 = 0.5625; (LOW/100)^3 = 0.4219 < BAD/100
    public static final int QUAL_LOW = 75;
    //
    // detect two other nearly *identical* tweet THEN SPAM
    // (BAD/100)^2 = 0.25 < SPAM/100
    public static final int QUAL_BAD = 50;
    // isSpam() reports true for qualities in the range [0, QUAL_SPAM).
    public static final int QUAL_SPAM = 26;
    // Identity of the tweet; the only field used by equals and hashCode.
    private final long twitterId;
    // Tweet text after it went through Helper.xmlCharacterWhitelist.
    private String text;
    // Tweets attached through addReply.
    private Set<JTweet> replies = new LinkedHashSet<JTweet>();
    // Stored counter; getRetweetCount adds the retweets found in replies.
    private int retweetCount;
    // True if the lower-cased text contains "rt @" (computed in setText_).
    private boolean retweet = false;
    private boolean daemon = false;
    private long version;
    private Date createdAt;
    // A non-null value marks the tweet as persistent (see isPersistent).
    private Date updatedAt;
    private JUser fromUser;
    private JTweet inReplyOf;
    // -1 means "no reply target" (see isDefaultInReplyId).
    private long inReplyTwitterId = -1L;
    private String location;
    private StringFreqMap textTerms = new StringFreqMap(8);
    private StringFreqMap languages = new StringFreqMap(4);
    private String language = TweetDetector.UNKNOWN_LANG;
    private int quality;
    // Lazily filled cache used by getLowerCaseText.
    private String lowerCaseText;
    private List<UrlEntry> urlEntries;
    // Stored counter; getReplyCount adds replies.size() on top.
    private int replyCount;
    // Concatenation of all strings handed to addQualAction (debugging aid).
    private String qualDebug;
    // Number of addQualAction calls.
    private int qualReductions = 0;
    // Tweet ids collected through addDuplicate.
    private Collection<Long> duplicates = new LinkedHashSet<Long>(4);
    // Creation time of this object; basis of getQueueAgeInSeconds.
    private Date instantiatedAt = new Date();
    private String feedSource;
    private double latitude;
    private double longitude;
    private int updateCount;
    private boolean isProtected = false;

    /**
     * You'll need to call init after that constructor
     */
    public JTweet(Tweet tw, JUser fromUser) {
        this(tw);
        setFromUser(fromUser);
    }

    public JTweet(long id, String text, JUser fromUser) {
        this(id, text, new Date());
        setFromUser(fromUser);
    }

    /**
     * For tests only! Use contructor instead which initialized user too!
     */
    public JTweet(Tweet tw) {
        this(tw.getId(), tw.getText(), tw.getCreatedAt());

        // if tweet was retrieved via Status object
        if (tw instanceof Twitter4JTweet) {
            Twitter4JTweet myTw = (Twitter4JTweet) tw;
            inReplyTwitterId = myTw.getInReplyToStatusId();
            urlEntries = myTw.getUrlEntries();
        }

        // most tweets have location == null. See user.location
        if (tw.getGeoLocation() != null)
            setGeoLocation(tw.getGeoLocation().getLatitude(),
                    tw.getGeoLocation().getLongitude());

        location = tw.getLocation();
    }

    /**
     * for tests only
     */
    public JTweet(long id, String text, Date createdAt) {
        quality = QUAL_MAX;
        this.twitterId = id;
        setText_(text);
        this.createdAt = createdAt;

        if (urlEntries == null)
            urlEntries = new ArrayList<UrlEntry>(1);
    }

    public JTweet addUrlEntry(UrlEntry ue) {
        urlEntries.add(ue);
        return this;
    }

    public Collection<UrlEntry> getUrlEntries() {
        return urlEntries;
    }

    public void setUrlEntries(Collection<UrlEntry> entries) {
        getUrlEntries().clear();
        getUrlEntries().addAll(entries);
    }

    @Override
    public long getVersion() {
        return version;
    }

    @Override
    public JTweet setVersion(long version) {
        if (version < 0)
            throw new IllegalStateException("version cannot be negative:" + version);

        this.version = version;
        return this;
    }

    public int getUpdateCount() {
        return updateCount;
    }

    public JTweet setUpdateCount(int updateCount) {
        this.updateCount = updateCount;
        return this;
    }

    public String getLowerCaseText() {
        if (lowerCaseText == null)
            lowerCaseText = getText().toLowerCase();

        return lowerCaseText;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public StringFreqMap getLanguages() {
        return languages;
    }

    public void setLanguages(StringFreqMap languages) {
        this.languages = languages;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public StringFreqMap getTextTerms() {
        return textTerms;
    }

    public void setTextTerms(StringFreqMap textTerms) {
        this.textTerms = textTerms;
    }

    public long getInReplyTwitterId() {
        return inReplyTwitterId;
    }

    public JTweet setInReplyTwitterId(long inReplyTwitterId) {
        this.inReplyTwitterId = inReplyTwitterId;
        return this;
    }

    public Long getTwitterId() {
        return twitterId;
    }

    public Date getCreatedAt() {
        return createdAt;
    }

    public JTweet setCreatedAt(Date createdAt) {
        this.createdAt = createdAt;
        return this;
    }

    public Date getUpdatedAt() {
        return updatedAt;
    }

    public JTweet setUpdatedAt(Date updatedAt) {
        this.updatedAt = updatedAt;
        return this;
    }

    public JTweet makePersistent() {
        setUpdatedAt(new Date());
        return this;
    }

    /**
     * @return false if this tweet should be deleted after some days
     */
    public boolean isPersistent() {
        return updatedAt != null;
    }

    public void setFromUser(JUser fromUser, boolean reverse) {
        this.fromUser = fromUser;
        if (reverse)
            fromUser.addOwnTweet(this, false);
    }

    public JTweet setFromUser(JUser fromUser) {
        setFromUser(fromUser, true);
        return this;
    }

    public JUser getFromUser() {
        return fromUser;
    }

    public void setReplyCount(int rp) {
        this.replyCount = rp;
    }

    public JTweet addReply(JTweet tw) {
        replies.add(tw);
        tw.setInReplyOf(this);
        return this;
    }

    public int getReplyCount() {
        // TODO better design! (do not mix count and replies)
        return replyCount + replies.size();
    }

    public JTweet getInReplyOf() {
        return inReplyOf;
    }

    public void setInReplyOf(JTweet inReplyOf) {
        this.inReplyOf = inReplyOf;
        if (inReplyOf == null)
            inReplyTwitterId = -1L;
        else
            inReplyTwitterId = inReplyOf.getTwitterId();
    }

    public JTweet setRetweetCount(int rt) {
        this.retweetCount = rt;
        return this;
    }

    public int getRetweetCount() {
        // TODO better design! (do not mix count and replies)
        int tmp = 0;
        for (JTweet tw : replies) {
            if (tw.isRetweet())
                tmp++;
        }
        return retweetCount + tmp;
    }

    public String getText() {
        return text;
    }

    private void setText_(String t) {
        text = t;
        // skip none-utf8 characters, otherwise we have major problems while
        // querying solr
        this.text = Helper.xmlCharacterWhitelist(text);
        retweet = getLowerCaseText().contains("rt @");
    }

    public boolean isRetweet() {
        return retweet;
    }

    public String extractRTText() {
        int index1 = getLowerCaseText().indexOf("rt @");
        if (index1 < 0)
            return "";

        index1 = getText().indexOf(" ", index1 + 4);
        if (index1 < 0)
            return "";

        return getText().substring(index1 + 1).trim();
    }

    public boolean isRetweetOf(JTweet tw) {
        // e.g. return true if this.text == RT @userA: text
        // to lower case is necessary because the case of the fromUser isn't important
        if (!isRetweet())
            return false;

        String thisT = getLowerCaseText();
        String extT = tw.getLowerCaseText();
        return thisT.contains("rt @" + tw.getFromUser() + ": " + extT) || thisT.contains("rt @" + tw.getFromUser() + " " + extT);
//        return thisT.matches(".*rt @" + tw.getFromUser() + ":? " + extT + ".*");
    }

    public JTweet setDaemon(boolean daemon) {
        this.daemon = daemon;
        return this;
    }

    /**
     * If a tweet is added to the system and it is a retweet but no original
     * tweet can be found a daemon tweet will be created to reflect this
     * missing tweet.
     *
     * daemon tweets are expensive to look for and only 0.3% of the tweets (!)
     * are only reactivated daemon tweets!
     */
    public boolean isDaemon() {
        return daemon;
    }

    public int getQuality() {
        return quality;
    }

    public JTweet multiplyQuality(double factor) {
        quality *= factor;
        return this;
    }

    public JTweet setQuality(int quality) {
        this.quality = quality;
        return this;
    }

    public boolean isSpam() {
        return quality < JTweet.QUAL_SPAM && quality >= 0;
    }

    /**
     * For debugging purposes
     */
    public void addQualAction(String str) {
        if (qualDebug == null)
            qualDebug = str;
        else
            qualDebug += str;

        qualReductions++;
    }

    public String getQualDebug() {
        return qualDebug;
    }

    public int getQualReductions() {
        return qualReductions;
    }

    public static boolean isDefaultInReplyId(long inReplyTwitterId) {
        return inReplyTwitterId == -1;
    }

    /**
     * skip tweets with identical id or identical text. For the latter case:
     * greater ids will win and identical text is only skipped if there is no
     * tweet in-between. see the test case
     */
    public static void deduplicate(List<JTweet> list) {
        // now remove tweets if they have the identical twitterId or text.
        // the standard hashCode/equals are based on the twitterId only
        Iterator<JTweet> iter = list.iterator();
        JTweet prevTweet = null;
        while (iter.hasNext()) {
            JTweet tw = iter.next();
            if (prevTweet != null && (tw.getTwitterId().equals(prevTweet.getTwitterId())
                    || tw.getText().equals(prevTweet.getText()))) {
                iter.remove();
            }
            prevTweet = tw;
        }
    }

    public static void sortAndDeduplicate(List<JTweet> list) {
        Collections.sort(list, tweetIdComparator);
        deduplicate(list);
    }

    @Override
    public int hashCode() {
        return 67 * 5 + (int) (this.twitterId ^ (this.twitterId >>> 32));
    }

    @Override
    public boolean equals(Object obj) {
        if (obj == null || getClass() != obj.getClass())
            return false;

        return this.twitterId == ((JTweet) obj).twitterId;
    }

    @Override
    public String toString() {
        return twitterId + " " + createdAt + " " + text + " v" + getVersion();
    }
    // Maps a lower-cased term to the keys (language or category constants of
    // TweetDetector) it is registered under. Filled by the static
    // initializer from the noise_words_<lang>.txt resources and the
    // hard-coded sets below.
    public static final Map<String, Set<String>> NOISE_WORDS = new LinkedHashMap<String, Set<String>>();
    // Same layout as NOISE_WORDS. Filled by the static initializer from the
    // lang_det_<lang>.txt resources plus the per-language noise words.
    public static final Map<String, Set<String>> LANG_DET_WORDS = new LinkedHashMap<String, Set<String>>();
    // Single latin letters; registered under TweetDetector.SINGLE_CHAR_TERMS.
    public static final Set<String> NOISE_WORDS_SINGLE = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
            }));
    // Short numbers; registered under TweetDetector.NUM_TERMS.
    // "100" is listed twice, the set keeps it once.
    public static final Set<String> NOISE_WORDS_NUM = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
                "1", "10", "100", "11", "12", "13", "14", "15", "16", "17",
                "18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
                "27", "28", "29", "3", "30", "31", "32", "33", "34", "35",
                "36", "37", "38", "39", "4", "40", "41", "42", "43", "44",
                "45", "46", "47", "48", "49", "5", "50", "51", "52", "53",
                "54", "55", "56", "57", "58", "59", "6", "60", "61", "62",
                "63", "64", "65", "66", "67", "68", "69", "7", "70", "71",
                "72", "73", "74", "75", "76", "77", "78", "79", "8", "80",
                "81", "82", "83", "84", "85", "86", "87", "88", "89", "9",
                "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "100", "000"}));
    // Twitter slang and miscellaneous tokens; registered under
    // TweetDetector.MISC_TERMS. NOTE(review): addFrom skips entries starting
    // with "//", so "//" itself never reaches NOISE_WORDS -- confirm intent.
    public static final Set<String> NOISE_WORDS_MISC = new LinkedHashSet<String>(Arrays.asList(new String[]{
                // ### TWITTER
                "ah",
                "aw", "cu", "ff",
                "haha", "hahaha", "hehe", "hey", "hi",
                "pls",
                "rt", "re", "soo", "thx",
                "yeah", "via",
                "/by", "/cc", "/via",
                "+1", "-1", ";d", "^^",
                // ### MISC
                ".", ",", ";", "ur", "tx", "ini", "ii", "iii",
                "//", "\\n", "\n", "com", "de", "el", "en", "je", "jp", "lol",
                "ne", "om", "ve", "ya", "yr", "za"
            }));
    // ### Foreign-language terms ###
    // Not assigned to a specific language; registered under
    // TweetDetector.UNKNOWN_LANG. Some entries occur twice (e.g. "ga") and
    // " inte" carries a leading blank which addFrom trims away.
    public static final Set<String> NOISE_WORDS_UNSORTED = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "¿qué",
                "ak", "aku", "aja", "al", "ada", "amb", "así", "au", "avec",
                "δεν",
                "bien", "boa", "bom", "bueno",
                "ca", "ça", "cap", "ce", "c'est", "cek", "ces", "che", "chi", "ci",
                "col", "com", "como", "con", "crec", "cosa", "cuando", "cumpleaños",
                "dan", "dans", "dc", "del", "decir", "dólar", "dong", "dua",
                "di",
                "ed", "een", "ei", "el", "els", "em", "en", "entre", "era", "és", "est",
                "está", "esta", "estes", "estoy", "eso", "et", "été", "ex",
                "fer", "fu",
                "ga", "ge", "gue",
                "ha", "hay", "han", "het", "ho", "hoy",
                "ik",
                "il", " inte", "iv",
                "jajaja", "je", "jo", "jos", "ju",
                "και",
                "ki",
                "ke",
                "la", "las", "le", "les", "lett", "leur", "li", "lo", "los",
                "mas", "más",
                "mejor", "més", "merci", "ma", "me", "mi", "mon", "muchas", "muy",
                "με",
                "não", "nada",
                "ne", "ni", "nih", "non", "nor", "nos", "notre", "nu",
                "nya",
                "ga", "gracias", "gua", "guau",
                "θα",
                "opció", "ou", "oui",
                "par", "para", "pas", "per", "pero", "por", "pour", "pro",
                "qualche", "que", "qu", "qui",
                "san",
                "se", "sen", "ses", "sí", "si", "sin",
                "sólo", "son", "somme", "soirée", "sous",
                "su", "suis", "sul", "sur", "sus",
                "ta", "també", "te", "té", "tem", "ti", "tinc", "tion", "tive", "todos", "το", "tous", "tra", "très", "tu",
                "uma", "un", "una", "une", "ut",
                "va", "van", "να", "vi", "vie", "vos", "vous", "votre",
                "yang", "για", "yg", "yo", "qué"}));
    // Phrases listed as exceptions; not referenced inside this class.
    public static final Set<String> PHRASE_WHITE_LIST = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "bin laden", // -> otherwise wrong language detection for 'alqaedatracker' because of 'bin'
                "open source"
            }));

    // Fills LANG_DET_WORDS and NOISE_WORDS once when the class is loaded.
    // The order matters: the per-language noise words are copied into
    // LANG_DET_WORDS before the language independent sets are added to
    // NOISE_WORDS, so the latter never reach LANG_DET_WORDS.
    // A resource which cannot be read aborts class initialization with a
    // RuntimeException (see importFrom / importNoiseFrom).
    static {
        // fill collection for language detection
        for (String lang : TweetDetector.LANGS) {
            importFrom(LANG_DET_WORDS, lang);
        }

        // fill collection for noise word determination
        for (String lang : TweetDetector.LANGS) {
            importNoiseFrom(NOISE_WORDS, lang);
        }

        // every per-language noise word also counts for language detection
//        int delta = LANG_DET_WORDS.size();
        for (Entry<String, Set<String>> noiseTerms : NOISE_WORDS.entrySet()) {
            addFrom(LANG_DET_WORDS, noiseTerms);
        }
        //System.out.println("added " + (LANG_DET_WORDS.size() - delta) + " terms to lang detection from noise terms");

        // foreign terms without a known language
        addFrom(NOISE_WORDS, TweetDetector.UNKNOWN_LANG, NOISE_WORDS_UNSORTED);

        // indifferent
        addFrom(NOISE_WORDS, TweetDetector.MISC_TERMS, NOISE_WORDS_MISC);
        addFrom(NOISE_WORDS, TweetDetector.SINGLE_CHAR_TERMS, NOISE_WORDS_SINGLE);
        addFrom(NOISE_WORDS, TweetDetector.NUM_TERMS, NOISE_WORDS_NUM);
    }

    public static void importNoiseFrom(Map<String, Set<String>> words, String lang) {
        try {
            List<String> list = Helper.readFile(Helper.createBuffReader(JTweet.class.getResourceAsStream("noise_words_" + lang + ".txt")));
            addFrom(words, lang, list);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public static void importFrom(Map<String, Set<String>> words, String lang) {
        try {
            List<String> list = Helper.readFile(Helper.createBuffReader(JTweet.class.getResourceAsStream("lang_det_" + lang + ".txt")));
            addFrom(words, lang, list);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public static void addFrom(Map<String, Set<String>> words, String lang, Collection<String> collection) {
        for (String str : collection) {
            if (str.isEmpty() || str.startsWith("//"))
                continue;

            str = str.trim().toLowerCase();
            Set<String> langs = words.get(str);
            if (langs == null)
                langs = new LinkedHashSet<String>(10);

            langs.add(lang);
            words.put(str, langs);
        }
    }

    public static void addFrom(Map<String, Set<String>> words, Entry<String, Set<String>> entry) {
        String str = entry.getKey();
        if (str.isEmpty() || str.startsWith("//"))
            return;

        str = str.trim().toLowerCase();
        Set<String> langs = words.get(str);
        if (langs == null)
            langs = new LinkedHashSet<String>(10);

        langs.addAll(entry.getValue());
        words.put(str, langs);
    }

    /**
     * specifies how many existing tweets with similar content were found
     */
    public Collection<Long> getDuplicates() {
        return duplicates;
    }

    public void addDuplicate(long twId) {
        duplicates.add(twId);
    }

    public int getQueueAgeInSeconds() {
        return Math.round((System.currentTimeMillis() - instantiatedAt.getTime()) / 1000f);
    }

    public JTweet setFeedSource(String feedSource) {
        this.feedSource = feedSource;
        return this;
    }

    public String getFeedSource() {
        return feedSource;
    }

    @Override
    public String getId() {
        return Long.toString(getTwitterId());
    }

    public JTweet setGeoLocation(double lat, double lon) {
        latitude = lat;
        longitude = lon;
        return this;
    }

    /**
     * @return latitude
     */
    public double getLat() {
        return latitude;
    }

    /**
     * @return longitude
     */
    public double getLon() {
        return longitude;
    }

    /**
     * This method specifies how this tweet should get updated from a
     * tweet - either an out-of-date tweet fetched from index or otherway around
     */
    @Override
    public JTweet updateFrom(JTweet a) {
        if (!getId().equals(a.getId()))
            throw new IllegalStateException("ids have to be the same to call update. This:" + this + " update:" + a);

        if (getRetweetCount() > a.getRetweetCount())
            return this;

        setReplyCount(a.replyCount);
        setRetweetCount(a.retweetCount);
        replies.clear();
        for (JTweet repl : a.replies) {
            addReply(repl);
        }
        duplicates.clear();
        for (Long val : a.getDuplicates()) {
            addDuplicate(val);
        }
        return this;
    }

    public JTweet setProtected(boolean aProtected) {
        isProtected = aProtected;
        return this;
    }

    public boolean isProtected() {
        return isProtected;
    }

    public String getUrl() {
        if (getUrlEntries() == null || getUrlEntries().isEmpty())
            return null;

        return getUrlEntries().iterator().next().getResolvedUrl();
    }
}
TOP

Related Classes of de.jetwick.data.JTweet

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.