Package de.jetwick.es

Source Code of de.jetwick.es.ElasticTweetSearch

/*
*  Copyright 2010 Peter Karich jetwick_@_pannous_._info
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package de.jetwick.es;

import java.util.regex.Pattern;
import de.jetwick.util.MyDate;
import org.elasticsearch.search.facet.filter.FilterFacet;
import org.elasticsearch.search.SearchHits;
import de.jetwick.config.Configuration;
import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.tw.Extractor;
import de.jetwick.tw.cmd.SerialCommandExecutor;
import de.jetwick.tw.cmd.TermCreateCommand;
import de.jetwick.util.AnyExecutor;
import de.jetwick.util.Helper;
import de.jetwick.util.MapEntry;
import de.jetwick.util.StopWatch;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.GeoDistanceFilterBuilder;
import org.elasticsearch.index.query.NotFilterBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.RangeFilterBuilder;
import org.elasticsearch.index.search.geo.GeoDistance;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.facet.Facet;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.search.facet.Facets;
import org.elasticsearch.search.facet.query.QueryFacet;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Provides search functionality via elasticsearch.
*
* @author Peter Karich, jetwick_@_pannous_._info
*/
public class ElasticTweetSearch extends AbstractElasticSearchQueueEnabled<JTweet> {

    /**
     * Time span computed as 4 * 24 * MyDate.ONE_HOUR; presumably four days in
     * milliseconds, as the name suggests -- confirm against MyDate.
     */
    public static final long OLDEST_DT_IN_MILLIS = 4 * 24 * MyDate.ONE_HOUR;
    // --- field (and facet) names of the tweet document, see createDoc/readDoc ---
    public static final String TITLE = "dest_title_t";
    public static final String TWEET_TEXT = "tw";
    public static final String DATE = "dt";
    public static final String DATE_FACET = "datefacet";
    public static final String RT_COUNT = "retw_i";
    public static final String DUP_COUNT = "dups_i";
    public static final String IS_RT = "crt_b";
    public static final String UPDATE_DT = "update_dt";
    public static final String TAG = "tag";
    public static final String INREPLY_ID = "inreply_l";
    public static final String QUALITY = "quality_i";
    public static final String LANG = "lang";
    public static final String URL_COUNT = "url_i";
    public static final String FIRST_URL_TITLE = "dest_title_1_s";
    public static final String USER = "user";
    // --- ready-made filter expressions of the form field:value or field:[from TO to] ---
    public static final String FILTER_NO_DUPS = DUP_COUNT + ":0";
    public static final String FILTER_ONLY_DUPS = DUP_COUNT + ":[1 TO *]";
    public static final String FILTER_NO_URL_ENTRY = URL_COUNT + ":0";
    public static final String FILTER_URL_ENTRY = URL_COUNT + ":[1 TO *]";
    // quality strictly above JTweet.QUAL_SPAM
    public static final String FILTER_NO_SPAM = QUALITY + ":[" + (JTweet.QUAL_SPAM + 1) + " TO *]";
    // quality up to and including JTweet.QUAL_SPAM
    public static final String FILTER_SPAM = QUALITY + ":[* TO " + JTweet.QUAL_SPAM + "]";
    /** Name of the field that receives the relevancy value computed in createDoc. */
    public static final String RELEVANCE = "relevance";
    /** Prefix which is concatenated with getIndexType() to form the key of id filters. */
    public static final String _ID = "_id_";
    /** Name of the index; can be replaced via setIndexName. */
    private String indexName = "twindex";
    /** Executors registered via addListener and unregistered via removeListener. */
    private List<AnyExecutor<JTweet>> commitListener = new ArrayList<AnyExecutor<JTweet>>(1);
    /** Logger named after the runtime class of this instance. */
    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Creates an instance without passing a URL or client to the super class.
     */
    public ElasticTweetSearch() {
    }

    /**
     * Creates an instance for the tweet search URL stored in the configuration.
     *
     * @param config configuration whose getTweetSearchUrl() value is used
     */
    public ElasticTweetSearch(Configuration config) {
        this(config.getTweetSearchUrl());
    }

    /**
     * Creates an instance by handing the specified URL to the super class.
     *
     * @param url passed unchanged to the super constructor
     */
    public ElasticTweetSearch(String url) {
        super(url);
    }

    /**
     * Creates an instance by handing the specified client to the super class.
     *
     * @param client passed unchanged to the super constructor
     */
    public ElasticTweetSearch(Client client) {
        super(client);
    }

    /**
     * @return the currently configured index name ("twindex" unless changed
     * via setIndexName)
     */
    @Override
    public String getIndexName() {
        return indexName;
    }

    /**
     * Replaces the index name returned by getIndexName.
     *
     * @param indexName the new index name
     */
    @Override
    public void setIndexName(String indexName) {
        this.indexName = indexName;
    }

    /**
     * @return the constant type name "tweet"
     */
    @Override
    public String getIndexType() {
        return "tweet";
    }

    /**
     * Package-private accessor for the client used by this search.
     * NOTE(review): the client field is not declared in this class, presumably
     * it is inherited from the super class.
     */
    Client getClient() {
        return client;
    }

    public void deleteUntil(Date removeUntil) {
        logger.info("Deleting tweets older than " + removeUntil);
        NotFilterBuilder notPersistentFilter = FilterBuilders.notFilter(FilterBuilders.existsFilter(UPDATE_DT));
        FilterBuilder fewRetweetsFilter = FilterBuilders.rangeFilter(RT_COUNT).lt(100).includeUpper(false);
        RangeFilterBuilder tooOldFilter = FilterBuilders.rangeFilter(DATE);
        tooOldFilter.lte(removeUntil);
        FilterBuilder filter = FilterBuilders.andFilter(tooOldFilter,
                notPersistentFilter, fewRetweetsFilter);

        client.prepareDeleteByQuery(getIndexName()).setTypes(getIndexType()).
                setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filter)).
                execute().
                actionGet();
    }

    public void delete(Collection<JTweet> tws) {
        if (tws.isEmpty())
            return;

        try {
            for (JTweet tw : tws) {
                deleteById(tw.getId());
            }

        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    @Override
    public XContentBuilder createDoc(JTweet tw) throws IOException {
        if (tw.getFromUser() == null) {
            // this came from UpdateResult.addNewTweet(tweet1); UpdateResult.addRemovedTweet(tweet1) at the same time
            // but should be fixed via if (!removedTweets.contains(tweet)) newTweets.add(tweet);
            logger.error("fromUser of tweet must not be null:" + tw.getTwitterId() + " " + tw.getText());
            return null;
        }

        // daemon tweets have no known twitterId and no known createdAt date
        if (tw.isDaemon())
            return null;

        XContentBuilder b = JsonXContent.contentBuilder().startObject();
        b.field(TWEET_TEXT, tw.getText());
        b.field("tw_i", tw.getText().length());
        b.field(UPDATE_DT, tw.getUpdatedAt());
        b.field(DATE, tw.getCreatedAt());
        b.field(IS_RT, tw.isRetweet());

        if (tw.getLocation() == null)
            b.field("loc", tw.getFromUser().getLocation());
        else
            b.field("loc", tw.getLocation());

        b.field("geo", tw.getLat() + "," + tw.getLon());

        if (!JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()))
            b.field(INREPLY_ID, tw.getInReplyTwitterId());

        b.field(USER, tw.getFromUser().getScreenName());
        b.field("iconUrl", tw.getFromUser().getProfileImageUrl());

        double relevancy = tw.getCreatedAt().getTime() / MyDate.ONE_HOUR;
        // every 14 retweets boosts the tweet one hour further
        float scale = 14;
        if (tw.getRetweetCount() <= 100)
            relevancy += tw.getRetweetCount() / scale;
        else
            relevancy += 100 / scale;
        if (tw.getText().length() <= 30)
            relevancy *= 0.5;
        if (tw.getQuality() <= 65)
            relevancy *= 0.5;
        b.field(RELEVANCE, relevancy);

        for (Entry<String, Integer> entry : tw.getTextTerms().entrySet()) {
            b.field(TAG, entry.getKey());
        }
       
        int counter = 0;
        for (UrlEntry urlEntry : tw.getUrlEntries()) {
            counter++;
            b.field("orig_url_" + counter + "_s", urlEntry.getOriginalUrl(tw));
            b.field("url_pos_" + counter + "_s", urlEntry.getIndex() + "," + urlEntry.getLastIndex());
            b.field("dest_url_" + counter + "_s", urlEntry.getResolvedUrl());
            if (!Helper.isEmpty(urlEntry.getResolvedDomain()))
                b.field("dest_domain_" + counter + "_s", urlEntry.getResolvedDomain());

            if (!Helper.isEmpty(urlEntry.getResolvedDomain()))
                b.field("dest_title_" + counter + "_s", urlEntry.getResolvedTitle());

            if (counter == 1)
                b.field(TITLE, urlEntry.getResolvedTitle());

            if (counter >= 3)
                break;
        }

        b.field(URL_COUNT, counter);
        b.field(DUP_COUNT, tw.getDuplicates().size());
        b.field(LANG, tw.getLanguage());
        b.field(QUALITY, tw.getQuality());
        b.field("repl_i", tw.getReplyCount());
        b.field(RT_COUNT, tw.getRetweetCount());

        b.endObject();
        return b;
    }

    @Override
    public JTweet readDoc(String idAsStr, long version, Map<String, Object> source) {
        // if we use in mapping: "_source" : {"enabled" : false}
        // we need to include all fields in query to use doc.getFields()
        // instead of doc.getSource()

        String name = (String) source.get(USER);
        String text = (String) source.get(TWEET_TEXT);
        if (text == null || name == null || idAsStr == null) {
            logger.error("Null tweet text or id!!!??" + idAsStr + " " + name + " " + text);
            return new JTweet(-1L, "", new JUser(""));
        }

        JUser user = new JUser(name);
        user.setLocation((String) source.get("loc"));
        user.setProfileImageUrl((String) source.get("iconUrl"));

        long id = Long.parseLong(idAsStr);
        JTweet tw = new JTweet(id, text, user);
        tw.setVersion(version);

        String p = (String) source.get("geo");
        if (p != null)
            try {
                String[] strs = p.split(",");
                double lat = Double.parseDouble(strs[0]);
                double lon = Double.parseDouble(strs[1]);
                tw.setGeoLocation(lat, lon);
            } catch (Exception ex) {
            }

        tw.setCreatedAt(Helper.toDateNoNPE((String) source.get(DATE)));
        tw.setUpdatedAt(Helper.toDateNoNPE((String) source.get(UPDATE_DT)));
        int rt = ((Number) source.get(RT_COUNT)).intValue();
        int rp = ((Number) source.get("repl_i")).intValue();
        tw.setRetweetCount(rt);
        tw.setReplyCount(rp);

        if (source.get(QUALITY) != null)
            tw.setQuality(((Number) source.get(QUALITY)).intValue());

        tw.setLanguage((String) source.get(LANG));

        if (source.get(INREPLY_ID) != null) {
            long replyId = ((Number) source.get(INREPLY_ID)).longValue();
            tw.setInReplyTwitterId(replyId);
        }

        tw.setUrlEntries(Arrays.asList(parseUrlEntries(source)));
        return tw;
    }

    public UrlEntry[] parseUrlEntries(Map<String, Object> source) {
        int urlCount = 0;
        try {
            urlCount = ((Number) source.get(URL_COUNT)).intValue();
        } catch (Exception ex) {
        }

        if (urlCount == 0)
            return new UrlEntry[0];

        UrlEntry urls[] = new UrlEntry[urlCount];
        for (int i = 0; i < urls.length; i++) {
            urls[i] = new UrlEntry();
        }

        for (int counter = 0; counter < urls.length; counter++) {
            String str = (String) source.get("url_pos_" + (counter + 1) + "_s");
            String strs[] = (str).split(",");
            urls[counter].setIndex(Integer.parseInt(strs[0]));
            urls[counter].setLastIndex(Integer.parseInt(strs[1]));
        }

        for (int counter = 0; counter < urls.length; counter++) {
            String str = (String) source.get("dest_url_" + (counter + 1) + "_s");
            urls[counter].setResolvedUrl(str);
        }

        for (int counter = 0; counter < urls.length; counter++) {
            String str = (String) source.get("dest_domain_" + (counter + 1) + "_s");
            urls[counter].setResolvedDomain(str);
        }

        for (int counter = 0; counter < urls.length; counter++) {
            String str = (String) source.get("dest_title_" + (counter + 1) + "_s");
            urls[counter].setResolvedTitle(str);
        }
        return urls;
    }

    /**
     * Find a reason for a (trending) topic
     * 1. first query via q=topic
     * 2. retweet count should be high enough (not too high to have no results)
     *    but not too low (avoid noise) -> use facets with more fine grained buckets
     *    and determine the correct filterquery!
     * 3. return created solrquery (added sort 'oldest'!)
     */
    public JetwickQuery createFindOriginQuery(JetwickQuery oldQuery, String tag, int minResults) {
        if (tag.isEmpty())
            return new TweetQuery("");

        try {
            JetwickQuery q;
            if (oldQuery == null)
                q = new TweetQuery(tag);
            else
                q = oldQuery.getCopy().setQuery(tag);

            // copy current state of q into resQuery!
            JetwickQuery resQuery = q.getCopy();

            // more fine grained information about retweets
            Map<String, Integer> orderedFQ = new LinkedHashMap<String, Integer>();
            orderedFQ.put("[16 TO *]", 16);
            orderedFQ.put("[11 TO 15]", 11);
            orderedFQ.put("[6 TO 10]", 6);
            orderedFQ.put("[1 TO 5]", 1);
            orderedFQ.put("0", 0);

            q.setSize(0).addFilterQuery(IS_RT, false);
            for (String facQ : orderedFQ.keySet()) {
                q.addFacetQuery(RT_COUNT, facQ);
            }

            SearchResponse rsp = query(q);
            long results = rsp.getHits().getTotalHits();
            if (results == 0)
                return new TweetQuery(tag);

            resQuery.addFilterQuery(IS_RT, false);
            resQuery.setSort(DATE, "asc");

            long counter = 0;
            for (Entry<String, Integer> entry : orderedFQ.entrySet()) {
                FilterFacet ff = rsp.getFacets().facet(RT_COUNT + ":" + entry.getKey());
//                System.out.println("facets:" + ff.count());
                counter += ff.count();
                if (counter >= minResults) {
                    if (entry.getValue() > 0)
                        resQuery.addFilterQuery(RT_COUNT, "[" + entry.getValue() + " TO *]");
                    break;
                }
            }

            return resQuery;//.attachFacetibility();
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    Collection<JUser> search(String str) {
        List<JUser> user = new ArrayList<JUser>();
        query(user, new TweetQuery(str));
        return user;
    }

    @Override
    public SearchResponse query(JetwickQuery query) {
        return query(new ArrayList(), query);
    }

    public SearchResponse query(Collection<JUser> users, JetwickQuery query) {
        return query(users, super.query(query));
    }

    public SearchResponse query(Collection<JUser> users, SearchResponse rsp) {
        SearchHit[] docs = rsp.getHits().getHits();
        Map<String, JUser> usersMap = new LinkedHashMap<String, JUser>();
        for (SearchHit sd : docs) {
//            System.out.println(sd.getExplanation().toString());
            JUser u = readDoc(sd.getId(), sd.getVersion(), sd.getSource()).getFromUser();
            JUser uOld = usersMap.get(u.getScreenName());
            if (uOld == null)
                usersMap.put(u.getScreenName(), u);
            else
                uOld.addOwnTweet(u.getOwnTweets().iterator().next());
        }

        users.addAll(usersMap.values());
        return rsp;
    }

    public Collection<JTweet> searchReplies(long id, boolean retweet) {
        try {
            JetwickQuery sq = new TweetQuery(true).addFilterQuery("crt_b", retweet).addFilterQuery(INREPLY_ID, id);
            SearchResponse rsp = query(sq);
            return collectObjects(rsp);
        } catch (Exception ex) {
            logger.error("Error while searchReplies", ex);
            return Collections.EMPTY_SET;
        }
    }

    /**
     * Queues a single tweet and then calls forceEmptyQueueAndRefresh().
     * Package-private; judging by the name intended for tests.
     *
     * @param tmpTweets the tweet to queue
     */
    void testUpdate(JTweet tmpTweets) {
        queueObject(tmpTweets);
        forceEmptyQueueAndRefresh();
    }

    /**
     * Queues several tweets and then calls forceEmptyQueueAndRefresh().
     * Package-private; judging by the name intended for tests.
     *
     * @param tmpTweets the tweets to queue
     */
    void testUpdate(Collection<JTweet> tmpTweets) {
        queueObjects(tmpTweets);
        forceEmptyQueueAndRefresh();
    }

    /**
     * Updates a list of tweet's with its replies and retweets.
     *
     * @param tmpTweets
     * @param removeUntil the date until all old tweet should be removed
     * @param performDelete avoid too frequent removing!    
     * @return updated tweets
     */
    public Collection<JTweet> update(Collection<JTweet> tmpTweets, Date removeUntil, boolean performDelete) {
        try {
            Map<String, JUser> usersMap = new LinkedHashMap<String, JUser>();
            Map<Long, JTweet> existingTweets = new LinkedHashMap<Long, JTweet>();
            StringBuilder idStr = new StringBuilder();
            int counts = 0;
            // we can add max ~150 tweets per request (otherwise the webcontainer won't handle the long request)
            for (JTweet tw : tmpTweets) {
                if (counts > 0)
                    idStr.append(" OR ");
                counts++;
                idStr.append(tw.getTwitterId());
            }

            // get existing tweets and users               
            JetwickQuery query = new TweetQuery().addFilterQuery(_ID + getIndexType(), idStr.toString()).setSize(counts);
            SearchResponse rsp = query(query);
            SearchHits docs = rsp.getHits();

            for (SearchHit sd : docs) {
                JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
                existingTweets.put(tw.getTwitterId(), tw);
                JUser u = tw.getFromUser();
                JUser uOld = usersMap.get(u.getScreenName());
                if (uOld == null)
                    usersMap.put(u.getScreenName(), u);
                else
                    uOld.addOwnTweet(u.getOwnTweets().iterator().next());
            }

            // Avoid storing existing tweets again
            Map<Long, JTweet> twMap = new LinkedHashMap<Long, JTweet>();
            for (JTweet tmpTweet : tmpTweets) {
                // do not store if too old
                if (!tmpTweet.isPersistent() && tmpTweet.getCreatedAt().getTime() < removeUntil.getTime())
                    continue;

                JTweet exTw = existingTweets.get(tmpTweet.getTwitterId());
                // feed if new or if it should be persistent
                if (exTw == null || tmpTweet.isPersistent()) {
                    String name = tmpTweet.getFromUser().getScreenName();
                    JUser u = usersMap.get(name);
                    if (u == null) {
                        u = tmpTweet.getFromUser();
                        usersMap.put(name, u);
                    }

                    u.addOwnTweet(tmpTweet);
                    // tweet does not exist. so store it into the todo map
                    twMap.put(tmpTweet.getTwitterId(), tmpTweet);

                    // overwrite existing tweets if persistent BUT update version
                    if (tmpTweet.isPersistent() && exTw != null)
                        tmpTweet.setVersion(exTw.getVersion());
                }
            }

            LinkedHashSet<JTweet> updateTweets = new LinkedHashSet<JTweet>(twMap.values());
            updateTweets.addAll(findReplies(twMap));
            updateTweets.addAll(findRetweets(twMap, usersMap));
            updateTweets.addAll(findDuplicates(twMap));

            // add the additionally fetched tweets to the user but do not add to updateTweets
            // this is a bit expensive ~30-40sec for every store call on a large index!
//            fetchMoreTweets(twMap, usersMap);           
            store(updateTweets, false);

            // We are not receiving the deleted tweets! but do we need to
            // store the tweets where this deleted tweet was a retweet?
            // No. Because "userA: text" and "userB: RT @usera: text" now the second tweet is always AFTER the first!
            if (performDelete) {
                logger.info("Deleting tweets older than " + removeUntil);
                deleteUntil(removeUntil);
            }

            return updateTweets;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
    // Stop watches which store() hands over to the TermCreateCommand
    // (setSw1 .. setSw4); presumably used to time its processing steps.
    private StopWatch sw1 = new StopWatch();
    private StopWatch sw2 = new StopWatch();
    private StopWatch sw3 = new StopWatch();
    private StopWatch sw4 = new StopWatch();

    void store(Collection<JTweet> tweets, boolean refresh) {
        try {
            if (tweets.isEmpty())
                return;

            tweets = new SerialCommandExecutor(tweets).add(
                    new TermCreateCommand().setSw1(sw1).setSw2(sw2).setSw3(sw3).setSw4(sw4)).execute();

            List<JTweet> list = new ArrayList<JTweet>(tweets);
            Collection<Integer> failedArticleIndices = bulkUpdate(list, getIndexName());
            for (Integer integ : failedArticleIndices) {
                JTweet tw = list.get(integ);
                tw.setUpdateCount(tw.getUpdateCount() + 1);
                if (tw.getUpdateCount() > 10)
                    logger.warn("PROBLEM: skipped tweet. it failed " + tw.getUpdateCount() + " times:" + tw);
                else
                    queueFailedObject(tw);
            }
        } catch (Exception e) {
            logger.error("Exception while updating.", e);
        }
    }

    /**
     * For every user there should be at least 5 tweets to make spam detection
     * more efficient
     */
    public void fetchMoreTweets(Map<Long, JTweet> tweets, final Map<String, JUser> userMap) {
        for (JUser us : userMap.values()) {
            // guarantee 5 tweets to be in the cache
            if (us.getOwnTweets().size() > 4)
                continue;

            //  fetch 10 tweets if less than 5 tweets are in the cache           
            JetwickQuery query = new TweetQuery().addFilterQuery("user", us.getScreenName()).setSize(10);
            try {
                SearchResponse rsp = query(query);
                SearchHits docs = rsp.getHits();
                for (SearchHit sd : docs) {
                    JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
                    JTweet twOld = tweets.get(tw.getTwitterId());
                    if (twOld == null)
                        us.addOwnTweet(tw);
                }
            } catch (Exception ex) {
                throw new RuntimeException(ex);
            }
        }
    }

    /**
     * Connect tweets via its retweet text. Every retweet of the specified
     * map is run through an Extractor; if the original tweet can be found
     * (locally or in the index) and the retweet could be attached to it, the
     * original is part of the result.
     *
     * @param tweets the tweets to inspect, only retweets are processed
     * @param userMap users (with their own tweets) which are searched first;
     * looked up with the lower-cased user name
     * @return all tweets which should be updated
     */
    public Collection<JTweet> findRetweets(Map<Long, JTweet> tweets, final Map<String, JUser> userMap) {
        // 1. check if tweets contains originals which were retweeted -> only done for 'tweets'
        // 2. check if tweets contains retweets -> done for 'tweets' and for tweets in the index

        final Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
        // NOTE(review): 'text' and 'tweet' used below are not declared here,
        // presumably they are fields of Extractor set via setTweet -- confirm
        Extractor extractor = new Extractor() {

            @Override
            public boolean onNewUser(int index, String user) {
                // a user mention counts as retweet marker only if it directly follows "rt "
                boolean isRetweet = index >= 3 && text.substring(index - 3, index).equalsIgnoreCase("rt ");
                if (isRetweet) {
                    user = user.toLowerCase();
                    JUser existingUser = userMap.get(user);
                    JTweet resTw = null;

                    // check ifRetweetOf against local tweets
                    if (existingUser != null)
                        for (JTweet tmp : existingUser.getOwnTweets()) {
                            // the original has to be older than the retweet
                            if (tmp.getCreatedAt().getTime() < tweet.getCreatedAt().getTime()
                                    && tweet.isRetweetOf(tmp)) {
                                if (addReplyNoTricks(tmp, tweet)) {
                                    resTw = tmp;
                                    break;
                                }
                            }
                        }

                    // check ifRetweetOf against tweets existing in index
                    if (resTw == null)
                        resTw = connectToOrigTweet(tweet, user);

                    if (resTw != null) {
                        updatedTweets.add(resTw);
                        return false;
                    }
                }

                // break loop of Extractor because we only need the first user!
                return true;
            }
        };

        for (JTweet tw : tweets.values()) {
            if (tw.isRetweet()) {
                extractor.setTweet(tw).run();
            }
        }
        return updatedTweets;
    }

    /**
     * add relation to existing/original tweet
     */
    public JTweet connectToOrigTweet(JTweet tw, String toUserStr) {
        if (tw.isRetweet()) {
            // do not connect if retweeted user == user who retweets 
            if (toUserStr.equals(tw.getFromUser().getScreenName()))
                return null;

            try {
                // connect retweets to tweets only searchTweetsDays old
                SearchResponse rsp = query(new TweetQuery(JetwickQuery.escapeQuery(tw.extractRTText())).addFilterQuery(USER, toUserStr).
                        addFilterQuery(IS_RT, false).
                        setSize(10));
                List<JTweet> existingTw = collectObjects(rsp);
                for (JTweet tmp : existingTw) {
                    boolean isRetweet = tw.isRetweetOf(tmp);
                    if (isRetweet) {
                        boolean check = addReplyNoTricks(tmp, tw);
                        if (check)
                            return tmp;
                    }
                }
            } catch (Exception ex) {
                logger.error("couldn't connect tweet to orig tweet:" + ex.getMessage());
            }
        }
        return null;
    }

    /**
     * Connect tweets via its inReplyId
     *
     * @return all tweets which should be updated
     */
    public Collection<JTweet> findReplies(Map<Long, JTweet> tweets) {
        Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
        Map<Long, JTweet> replyMap = new LinkedHashMap<Long, JTweet>();
        for (JTweet tw : tweets.values()) {
            if (!JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()) && !tw.isRetweet())
                replyMap.put(tw.getInReplyTwitterId(), tw);
        }

        Iterator<JTweet> iter = tweets.values().iterator();
        findRepliesInBatch(iter, tweets, replyMap, updatedTweets);

        return updatedTweets;
    }

    /**
     * Connects replies and originals. Pairs which are both contained in the
     * input are connected directly; for the remaining tweets the ids are
     * collected and resolved with at most two batch queries against the
     * index. Exceptions of the batch queries are logged, not thrown.
     *
     * @param iter iterates the tweets to process
     * @param origTweets the input tweets keyed by twitter id
     * @param replyIdToTweetMap input replies keyed by the id of the tweet they answer
     * @param updatedTweets receives every tweet which got connected
     */
    protected void findRepliesInBatch(Iterator<JTweet> iter, Map<Long, JTweet> origTweets,
            Map<Long, JTweet> replyIdToTweetMap, Collection<JTweet> updatedTweets) {
        // number of ids appended to idStr, used as size of the second query
        int counter = 0;
        // ids of originals which have to be fetched from the index
        StringBuilder idStr = new StringBuilder();
        // ids of input tweets for which replies are searched in the index
        StringBuilder replyIdStr = new StringBuilder();
        while (iter.hasNext()) {
            JTweet tw = iter.next();
            // direction 1: is there an input tweet replying to tw?
            JTweet tmp = replyIdToTweetMap.get(tw.getTwitterId());
            if (tmp != null) {
                if (addReplyNoTricks(tw, tmp)) {
                    updatedTweets.add(tw);
                    updatedTweets.add(tmp);
                }
            } else {
                if (replyIdStr.length() > 0)
                    replyIdStr.append(" OR ");

                replyIdStr.append(tw.getTwitterId());
            }

            // direction 2 applies only if tw itself is a reply
            if (JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()))
                continue;

            // is the tweet which tw answers part of the input?
            tmp = origTweets.get(tw.getInReplyTwitterId());
            if (tmp != null) {
                if (addReplyNoTricks(tmp, tw)) {
                    updatedTweets.add(tw);
                    updatedTweets.add(tmp);
                }
            } else {
                counter++;
                if (idStr.length() > 0)
                    idStr.append(" OR ");

                idStr.append(tw.getInReplyTwitterId());
            }
        }

        try {
            // get tweets which replies our input tweets
            // INREPLY_ID:"tweets[i].id"
            if (replyIdStr.length() > 0) {
                JetwickQuery query = new TweetQuery().addFilterQuery(INREPLY_ID, replyIdStr.toString()).setSize(origTweets.size());
                findRepliesForOriginalTweets(query, origTweets, updatedTweets);
            }

            // get original tweets where we have replies
            if (idStr.length() > 0) {
                JetwickQuery query = new TweetQuery().addFilterQuery(_ID + getIndexType(), idStr.toString()).setSize(counter);
                selectOriginalTweetsWithReplies(query, origTweets.values(), updatedTweets);
            }
        } catch (Exception ex) {
            logger.error("couldn't find replies in a batch query", ex);
        }
    }

    protected void findRepliesForOriginalTweets(JetwickQuery query, Map<Long, JTweet> tweets,
            Collection<JTweet> updatedTweets) {

        Map<Long, JTweet> replyMap = new LinkedHashMap<Long, JTweet>();
        SearchResponse rsp = query(query);
        SearchHits docs = rsp.getHits();

        for (SearchHit sd : docs) {
            JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
            replyMap.put(tw.getTwitterId(), tw);
        }

        for (JTweet inReplSolrTweet : replyMap.values()) {
            if (JTweet.isDefaultInReplyId(inReplSolrTweet.getInReplyTwitterId()))
                continue;
            JTweet origTw = tweets.get(inReplSolrTweet.getInReplyTwitterId());
            if (origTw != null && addReplyNoTricks(origTw, inReplSolrTweet)) {
                updatedTweets.add(origTw);
                updatedTweets.add(inReplSolrTweet);
            }
        }
    }

    protected void selectOriginalTweetsWithReplies(JetwickQuery query, Collection<JTweet> tweets,
            Collection<JTweet> updatedTweets) {

        SearchResponse rsp = query(query);
        SearchHits docs = rsp.getHits();
        Map<Long, JTweet> origMap = new LinkedHashMap<Long, JTweet>();
        for (SearchHit sd : docs) {
            JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
            origMap.put(tw.getTwitterId(), tw);
        }

        if (origMap.size() > 0)
            for (JTweet inReplSolrTweet : tweets) {
                if (JTweet.isDefaultInReplyId(inReplSolrTweet.getInReplyTwitterId()))
                    continue;
                JTweet origTw = origMap.get(inReplSolrTweet.getInReplyTwitterId());
                if (origTw != null && addReplyNoTricks(origTw, inReplSolrTweet)) {
                    updatedTweets.add(origTw);
                    updatedTweets.add(inReplSolrTweet);
                }
            }
    }

    public boolean addReplyNoTricks(JTweet orig, JTweet reply) {
        if (orig.getFromUser().equals(reply.getFromUser()))
            return false;

        try {
            // ensure that reply.user has not already a tweet in orig.replies  
            JetwickQuery q = new TweetQuery().addFilterQuery(INREPLY_ID, orig.getTwitterId()).
                    addFilterQuery("-" + _ID + getIndexType(), reply.getTwitterId()).
                    addFilterQuery("user", reply.getFromUser().getScreenName());
            if (query(q).getHits().getTotalHits() > 0)
                return false;

            orig.addReply(reply);
            return true;
        } catch (Exception ex) {
            logger.error("couldn't add reply to:" + orig, ex);
            return false;
        }
    }

    /**
     * @param exec will be called directly after the tweets have been fed
     * into the index. WARNING: it is not guaranteed that the tweets are
     * already searchable, as every index has a real-time latency
     */
    public void addListener(AnyExecutor<JTweet> exec) {
        if (!commitListener.contains(exec))
            commitListener.add(exec);
    }

    public void removeListener(AnyExecutor<JTweet> exec) {
        commitListener.remove(exec);
    }

    public JTweet findByTwitterId(Long twitterId) {
        try {
            GetResponse rsp = client.prepareGet(getIndexName(), getIndexType(), Long.toString(twitterId)).
                    execute().actionGet();
            if (rsp.getSource() == null)
                return null;
            return readDoc(rsp.getId(), rsp.getVersion(), rsp.getSource());
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public Collection<String> getUserChoices(JetwickQuery lastQ, String input) {
        try {
            if (input.length() < 1)
                return Collections.emptyList();

            // NOT context dependent any longer ...                       
            input = input.toLowerCase();
            SearchRequestBuilder srb = createSearchBuilder();
            srb.setQuery(QueryBuilders.fieldQuery(USER, input + "*"));
            List<JUser> users = new ArrayList<JUser>();
            query(users, new TweetQuery(false));
            Set<String> res = new TreeSet<String>();
            for (JUser u : users) {
                if (u.getScreenName().startsWith(input))
                    res.add(u.getScreenName());

                if (res.size() > 9)
                    break;
            }

            return res;
        } catch (Exception ex) {
            logger.error("Error while getUserChoices:" + input + " " + lastQ, ex);
            return Collections.emptyList();
        }
    }

    public Collection<String> getQueryChoices(JetwickQuery lastQ, String input) {
        try {
            if (input.length() < 2)
                return Collections.emptyList();

            String firstPart = "";
            String secPart = input;
            int index = input.lastIndexOf(" ");
            Set<String> existingTerms = new HashSet<String>();
            if (index > 0 && index < input.length()) {
                firstPart = input.substring(0, index);
                secPart = input.substring(index + 1);
                for (String tmp : input.split(" ")) {
                    existingTerms.add(tmp.toLowerCase().trim());
                }
            } else
                existingTerms.add(secPart);

            if (lastQ == null) {
                lastQ = new TweetQuery(firstPart, false);
            } else {
                lastQ = lastQ.getCopy().setQuery(firstPart);
                // remove any date restrictions
                lastQ.removeFilterQueries(DATE);
                lastQ.removeFacets();
            }

            SearchRequestBuilder srb = createSearchBuilder();
            lastQ.initRequestBuilder(srb);

            TermsFacetBuilder tfb = FacetBuilders.termsFacet(TAG).field(TAG);
            if (!secPart.trim().isEmpty())
                tfb.regex(secPart + ".*", Pattern.DOTALL);

            srb.addFacet(tfb);
            SearchResponse rsp = query(new ArrayList<JUser>(), srb.execute().actionGet());
            Set<String> res = new TreeSet<String>();
            TermsFacet tf = rsp.facets().facet(TAG);
            if (tf != null) {
                for (TermsFacet.Entry cnt : tf.entries()) {
                    String lowerSugg = cnt.getTerm().toLowerCase();
                    if (existingTerms.contains(lowerSugg))
                        continue;

                    if (lowerSugg.startsWith(secPart)) {
                        if (firstPart.isEmpty())
                            res.add(cnt.getTerm());
                        else
                            res.add(firstPart + " " + cnt.getTerm());
                    }

                    if (res.size() > 9)
                        break;
                }
            }

            return res;
        } catch (Exception ex) {
            logger.error("Error while getQueryChoices:" + input + " " + lastQ + " -> Error:" + ex.getMessage());
            return Collections.emptyList();
        }
    }

    JUser findByUserName(String uName) {
        try {
            List<JUser> list = new ArrayList<JUser>();
            // get all tweets of the user so set rows large ...           
            query(list, new TweetQuery().addFilterQuery("user", uName.toLowerCase()).setSize(10));

            if (list.isEmpty())
                return null;

            return list.get(0);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public List<JTweet> searchTweets(JetwickQuery q) {
        try {
            return collectObjects(query(q));
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public Collection<String> searchTrends(JetwickQuery q, int limit) {
        try {
            q.addFacetField(TAG);
            SearchResponse rsp = query(q);
            Facets facets = rsp.facets();
            if (facets == null)
                return Collections.emptyList();

            Set<String> set = new LinkedHashSet<String>();
            for (Facet facet : facets.facets()) {
                if (facet instanceof TermsFacet) {
                    TermsFacet ff = (TermsFacet) facet;
                    for (TermsFacet.Entry e : ff.entries()) {
                        if (e.count() > limit)
                            set.add(e.getTerm());
                    }
                }
            }
            return set;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public String getTweetsAsString(JetwickQuery q, String separator) {
        StringBuilder sb = new StringBuilder();
        List<JTweet> tmpTweets = searchTweets(q);
        for (JTweet tweet : tmpTweets) {
            sb.append(Helper.toTwitterHref(tweet.getFromUser().getScreenName(), tweet.getTwitterId()));
            sb.append(separator);
            sb.append(tweet.getRetweetCount());
            sb.append(separator);
            sb.append(tweet.getText().replaceAll("\n", " "));
            sb.append("\n");
        }

        return sb.toString();
    }

    public Collection<JTweet> findDuplicates(Map<Long, JTweet> tweets) {
        final Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
        TermCreateCommand termCommand = new TermCreateCommand();
        double JACC_BORDER = 0.7;
        for (JTweet currentTweet : tweets.values()) {
            if (currentTweet.isRetweet())
                continue;

            JetwickQuery reqBuilder = new SimilarTweetQuery(currentTweet, false).addLatestDateFilter(24);
            if (currentTweet.getTextTerms().size() < 3)
                continue;

            int dups = 0;
            try {
                // find dups in index
                for (JTweet simTweet : collectObjects(query(reqBuilder))) {
                    if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()))
                        continue;

                    termCommand.calcTermsWithoutNoise(simTweet);
                    if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
                            >= JACC_BORDER) {
                        currentTweet.addDuplicate(simTweet.getTwitterId());
                        dups++;
                    }
                }
            } catch (Exception ex) {
                logger.error("Error while findDuplicate query execution", ex);
            }

            // find dups in tweets map
            for (JTweet simTweet : tweets.values()) {
                if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()) || simTweet.isRetweet())
                    continue;

                if (currentTweet.getCreatedAt().getTime() < simTweet.getCreatedAt().getTime())
                    continue;

                termCommand.calcTermsWithoutNoise(simTweet);
                if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
                        >= JACC_BORDER) {
                    currentTweet.addDuplicate(simTweet.getTwitterId());
                    dups++;
                }
            }

//            tw.setDuplicates(dups);
        }

        return updatedTweets;
    }

    public SearchResponse updateSavedSearches(final Collection<SavedSearch> savedSearches) {
        JetwickQuery q = new TweetQuery() {

            @Override
            protected void processFacetQueries(SearchRequestBuilder srb) {
                for (SavedSearch ss : savedSearches) {
                    srb.addFacet(FacetBuilders.queryFacet(SAVED_SEARCHES + "_" + ss.getId(),
                            createQSQB(ss.calcFacetQuery())));
                }
            }
        }.setFrom(0).setSize(0);

        return query(q);
    }

    QueryStringQueryBuilder createQSQB(String qStr) {
        return QueryBuilders.queryString(qStr).
                useDisMax(true).defaultOperator(QueryStringQueryBuilder.Operator.AND).
                field(ElasticTweetSearch.TWEET_TEXT).field(TITLE).field(USER, 0);
    }

    /**
     * @return a collection where the first string indicates the filter key
     * which should be removed to increase the number of results.
     * Of course this can be only a heuristic sorting against the count of each
     * filter query
     */
    public Collection<String> suggestRemoval(final JetwickQuery q) {
        SearchResponse rsp = query(new TweetQuery() {

            @Override
            protected void processFacetQueries(SearchRequestBuilder srb) {
                int counter = 0;
                String initFacetQ = SavedSearch.buildInitialFacetQuery(q.getQuery());
                for (Entry<String, Object> e : q.getFilterQueries()) {
                    String facetQuery = initFacetQ + " AND " + e.getKey() + ":" + e.getValue().toString();
                    srb.addFacet(FacetBuilders.queryFacet("ss_" + counter, createQSQB(facetQuery)));
                    counter++;
                }
            }
        });

        List<Entry<String, Long>> list = new ArrayList<Entry<String, Long>>();
        int counter = 0;
        boolean forceDateSuggestion = false;
        for (Entry<String, Object> e : q.getFilterQueries()) {
            QueryFacet qf = (QueryFacet) rsp.facets().facet("ss_" + counter);
            list.add(new MapEntry<String, Long>(e.getKey(), qf.count()));
            counter++;
            if (DATE.equals(e.getKey())) {
                try {
                    String str = (String) e.getValue();
                    int index = str.indexOf(" ");
                    // get from date
                    if (index > 0)
                        str = str.substring(1, index);
                    if ((new Date().getTime() - Helper.toDate(str).getTime()) / MyDate.ONE_DAY <= 1)
                        forceDateSuggestion = true;
                } catch (Exception ex) {
                }
            }
        }

        Helper.sortInplaceLongReverse(list);
        Collection<String> res = new LinkedHashSet<String>();
        for (Entry<String, Long> e : list) {
            if (e.getValue() > 0)
                res.add(e.getKey());
        }

        if (forceDateSuggestion)
            res.add(DATE);

        return res;
    }

    public List<JTweet> searchGeo(double lat, double lon, double length) {
        GeoDistanceFilterBuilder geoFilter = FilterBuilders.geoDistanceFilter("geo").
                lat(lat).lon(lon).distance(length, DistanceUnit.KILOMETERS).geoDistance(GeoDistance.PLANE);
        SearchRequestBuilder srb = createSearchBuilder();
        srb.setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), geoFilter));
        return collectObjects(srb.execute().actionGet());
    }

    public Set<String> getQuerySuggestions(JetwickQuery query, SearchResponse rsp, long hits) {
        TermsFacet tags = (TermsFacet) rsp.facets().facet(ElasticTweetSearch.TAG);

        if (tags == null)
            return Collections.emptySet();
        Set<String> tmp = new LinkedHashSet<String>();
        for (TermsFacet.Entry e : tags.entries()) {
//            logger.info(e.term() + " " + e.count() + " " + hits);
            if (e.count() > hits / 10000.0 + 1) {
                boolean contains = false;
                for (String tmpTerm : tmp) {
                    if (e.term().contains(tmpTerm) || tmpTerm.contains(e.term()))
                        contains = true;
                }
                if (!contains)
                    tmp.add(e.term());
            }
        }

        Set<String> qSuggestions = new LinkedHashSet<String>();
        int counter = 0;
        for (String t : tmp) {
            if (query.getQuery().contains(t) || t.contains(query.getQuery()))
                continue;

            qSuggestions.add(query.getQuery() + " " + t);
            qSuggestions.add(query.getQuery() + " -" + t);

            if (++counter > 2)
                break;
        }

        if (qSuggestions.size() > 0)
            qSuggestions.add(query.getQuery());
        return qSuggestions;
    }

    public GetResponse findByTwitterIdRaw(Long twitterId) {
        return client.prepareGet(getIndexName(), getIndexType(), Long.toString(twitterId)).
                execute().actionGet();
    }

    SearchRequestBuilder createSearchBuilder(String indexName) {
        return client.prepareSearch(indexName).setTypes(getIndexType()).setVersion(true);
    }
    // non-protected tweets waiting to be fed, keyed by JTweet.getId();
    // filled in innerAdd, cleared at the end of innerThreadMethod and in deleteAll
    private Map<String, JTweet> tweets = new LinkedHashMap<String, JTweet>(100);
    // measures the time spent in innerThreadMethod; replaced after every statistics log
    private StopWatch sw = new StopWatch();
    // sums up the sizes of the update results; reset after every statistics log
    private int tweetCounter = 0;
    // size of the update result of the latest innerThreadMethod run
    private AtomicInteger feededTweets = new AtomicInteger(0);
    // protected tweets: handed to the commit listeners but not passed to update
    private Collection<JTweet> protectedTweets = new LinkedHashSet<JTweet>();
    // counts the innerThreadMethod runs; every 400th run sets the delete flag
    private int feedCounter = 0;

    public int getFeededTweets() {
        return feededTweets.get();
    }

    @Override
    public void innerAdd(JTweet tw) {
        // do not add protected tweets and add them only once
        if (!tw.isProtected()) {
            JTweet existingTweet = tweets.put(tw.getId(), tw);           
            if (existingTweet != null) {
                existingTweet.updateFrom(tw);               
                tweets.put(existingTweet.getId(), existingTweet);
            }
        } else
            protectedTweets.add(tw);
    }

    /**
     * Feeds the buffered tweets via update, logs throughput statistics once
     * more than getBatchSize() tweets were processed, passes every updated
     * and every protected tweet to all commit listeners and finally empties
     * both buffers.
     */
    @Override
    public void innerThreadMethod() throws InterruptedException {
        sw.start();
        // the delete flag is set while testing and on every 400th call (including the first)
        boolean delete = testing || feedCounter++ % 400 == 0;
        // tweets can be updated from another thread (failed tweets)
        // so update works on a copy of the buffered values
        Collection<JTweet> res = update(new ArrayList<JTweet>(tweets.values()), createRemoveOlderThan().toDate(), delete);
        tweetCounter += res.size();
        feededTweets.set(res.size());
        sw.stop();
        if (tweetCounter > getBatchSize()) {
            logger.info("Updated " + tweetCounter + " tweets "
                    + tweetCounter / sw.getSeconds() + " per sec. Remaining:"
                    + getTodoObjects().size());
            logger.info("sw1:" + sw1.getSeconds() + "\t sw2:" + sw2.getSeconds()
                    + "\t sw3:" + sw3.getSeconds() + "\t sw4:" + sw4.getSeconds());
            // start a new measuring period
            tweetCounter = 0;
            sw = new StopWatch();
        }

        // protected tweets were not passed to update but the listeners get them as well
        res.addAll(protectedTweets);
        for (AnyExecutor<JTweet> exec : commitListener) {
            for (JTweet tw : res) {
                exec.execute(tw);
            }
        }

        // NOTE(review): if another thread really adds tweets while this method runs,
        // tweets buffered after the copy above would be dropped here -- TODO confirm
        protectedTweets.clear();
        tweets.clear();
    }

    /**
     * Warning this is not real time!
     */
    public List<JTweet> findByUrl(String url) {
        SearchRequestBuilder srb = createSearchBuilder();
        srb.setSearchType(SearchType.QUERY_AND_FETCH);
        srb.setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(),
                FilterBuilders.orFilter(
                    FilterBuilders.termFilter("dest_url_1_s", url),
                    FilterBuilders.termFilter("orig_url_1_s", url))));
        return collectObjects(srb.execute().actionGet());
    }

    public boolean tooOld(Date dt) {
        return dt.getTime() < System.currentTimeMillis()
                - ElasticTweetSearch.OLDEST_DT_IN_MILLIS;
    }

    @Override
    public void deleteAll(String indexName, String indexType) {
        protectedTweets.clear();
        tweets.clear();
        super.deleteAll(indexName, indexType);
    }       
}
TOP

Related Classes of de.jetwick.es.ElasticTweetSearch

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.