// Source code of com.rarchives.ripme.ripper.rippers.TwitterRipper

package com.rarchives.ripme.ripper.rippers;


import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.jsoup.nodes.Document;


import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;


public class TwitterRipper extends AlbumRipper {


    private static final String DOMAIN = "twitter.com",
                                HOST   = "twitter";


    private static final int MAX_REQUESTS = Utils.getConfigInteger("twitter.max_requests", 10);
    private static final int WAIT_TIME = 2000;


    // Base 64 of consumer key : consumer secret
    private String authKey;
    private String accessToken;


    private enum ALBUM_TYPE {
        ACCOUNT,
        SEARCH
    }
    private ALBUM_TYPE albumType;
    private String searchText, accountName;
    
    public TwitterRipper(URL url) throws IOException {
        super(url);
        authKey = Utils.getConfigString("twitter.auth", null);
        if (authKey == null) {
            throw new IOException("Could not find twitter authentication key in configuration");
        }
    }


    @Override
    public boolean canRip(URL url) {
        return url.getHost().endsWith(DOMAIN);
    }


    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        // https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]{1,}).*$");
        Matcher m = p.matcher(url.toExternalForm());
        if (m.matches()) {
            albumType = ALBUM_TYPE.SEARCH;
            searchText = m.group(2);
            return url;
        }
        p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9\\-_]{1,}).*$");
        m = p.matcher(url.toExternalForm());
        if (m.matches()) {
            albumType = ALBUM_TYPE.ACCOUNT;
            accountName = m.group(2);
            return url;
        }
        throw new MalformedURLException("Expected username or search string in url: " + url);
    }
    
    private void getAccessToken() throws IOException {
        Document doc = Http.url("https://api.twitter.com/oauth2/token")
                           .ignoreContentType()
                           .header("Authorization", "Basic " + authKey)
                           .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
                           .header("User-agent", "ripe and zipe")
                           .data("grant_type", "client_credentials")
                           .post();
        String body = doc.body().html().replaceAll("&quot;", "\"");
        try {
            JSONObject json = new JSONObject(body);
            accessToken = json.getString("access_token");
            return;
        } catch (JSONException e) {
            // Fall through
            throw new IOException("Failure while parsing JSON: " + body, e);
        }
    }
    
    private void checkRateLimits(String resource, String api) throws IOException {
        Document doc = Http.url("https://api.twitter.com/1.1/application/rate_limit_status.json?resources=" + resource)
                            .ignoreContentType()
                            .header("Authorization", "Bearer " + accessToken)
                            .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
                            .header("User-agent", "ripe and zipe")
                            .get();
        String body = doc.body().html().replaceAll("&quot;", "\"");
        try {
            JSONObject json = new JSONObject(body);
            JSONObject stats = json.getJSONObject("resources")
                                   .getJSONObject(resource)
                                   .getJSONObject(api);
            int remaining = stats.getInt("remaining");
            logger.info("    Twitter " + resource + " calls remaining: " + remaining);
            if (remaining < 20) {
                logger.error("Twitter API calls exhausted: " + stats.toString());
                throw new IOException("Less than 20 API calls remaining; not enough to rip.");
            }
        } catch (JSONException e) {
            logger.error("JSONException: ", e);
            throw new IOException("Error while parsing JSON: " + body, e);
        }
    }
    
    private String getApiURL(Long maxID) {
        StringBuilder req = new StringBuilder();
        switch (albumType) {
        case ACCOUNT:
            req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
               .append("?screen_name=" + this.accountName)
               .append("&include_entities=true")
               .append("&exclude_replies=true")
               .append("&trim_user=true")
               .append("&include_rts=false")
               .append("&count=" + 200);
            break;
        case SEARCH:
            req.append("https://api.twitter.com/1.1/search/tweets.json")
               .append("?q=" + this.searchText)
               .append("&include_entities=true")
               .append("&result_type=recent")
               .append("&count=100");
            break;
        }
        if (maxID > 0) {
            req.append("&max_id=" + Long.toString(maxID));
        }
        return req.toString();
    }
    
    private List<JSONObject> getTweets(String url) throws IOException {
        List<JSONObject> tweets = new ArrayList<JSONObject>();
        logger.info("    Retrieving " + url);
        Document doc = Http.url(url)
                           .ignoreContentType()
                           .header("Authorization", "Bearer " + accessToken)
                           .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
                           .header("User-agent", "ripe and zipe")
                           .get();
        String body = doc.body().html().replaceAll("&quot;", "\"");
        Object jsonObj = new JSONTokener(body).nextValue();
        JSONArray statuses;
        if (jsonObj instanceof JSONObject) {
            JSONObject json = (JSONObject) jsonObj;
            if (json.has("errors")) {
                String msg = json.getJSONObject("errors").getString("message");
                throw new IOException("Twitter responded with errors: " + msg);
            }
            statuses = json.getJSONArray("statuses");
        } else {
            statuses = (JSONArray) jsonObj;
        }
        for (int i = 0; i < statuses.length(); i++) {
            tweets.add((JSONObject) statuses.get(i));
        }
        return tweets;
    }


    private void parseTweet(JSONObject tweet) throws MalformedURLException {
        if (!tweet.has("entities")) {
            logger.error("XXX Tweet doesn't have entitites");
            return;
        }


        JSONObject entities = tweet.getJSONObject("entities");


        if (entities.has("media")) {
            JSONArray medias = entities.getJSONArray("media");
            String url;
            JSONObject media;
            for (int i = 0; i < medias.length(); i++) {
                media = (JSONObject) medias.get(i);
                url = media.getString("media_url");
                if (url.contains(".twimg.com/")) {
                    url += ":large";
                }
                addURLToDownload(new URL(url));
            }
        }


        if (entities.has("urls")) {
            JSONArray urls = entities.getJSONArray("urls");
            JSONObject url;
            for (int i = 0; i < urls.length(); i++) {
                url = (JSONObject) urls.get(i);
                if (url.get("expanded_url") != null) {
                    handleTweetedURL(url.getString("url"));
                } else {
                    handleTweetedURL(url.getString("expanded_url"));
                }
            }
        }
    }
    
    private void handleTweetedURL(String url) {
        logger.error("[!] Need to handle URL: " + url);
    }


    @Override
    public void rip() throws IOException {
        getAccessToken();


        switch (albumType) {
        case ACCOUNT:
            checkRateLimits("statuses", "/statuses/user_timeline");
            break;
        case SEARCH:
            checkRateLimits("search", "/search/tweets");
            break;
        }


        Long lastMaxID = 0L;
        for (int i = 0; i < MAX_REQUESTS; i++) {
            List<JSONObject> tweets = getTweets(getApiURL(lastMaxID - 1));
            if (tweets.size() == 0) {
                logger.info("   No more tweets found.");
                break;
            }
            logger.debug("Twitter response #" + (i + 1) + " Tweets:\n" + tweets);
            if (tweets.size() == 1 && 
                    lastMaxID.equals(tweets.get(0).getString("id_str"))
                ) {
                break;
            }
            
            for (JSONObject tweet : tweets) {
                lastMaxID = tweet.getLong("id");
                parseTweet(tweet);
            }


            try {
                Thread.sleep(WAIT_TIME);
            }
            catch (InterruptedException e) {
                logger.error("[!] Interrupted while waiting to load more results", e);
                break;
            }
        }


        waitForThreads();
    }


    @Override
    public String getHost() {
        return HOST;
    }


    @Override
    public String getGID(URL url) throws MalformedURLException {
        switch (albumType) {
        case ACCOUNT:
            return "account_" + accountName;
        case SEARCH:
            StringBuilder gid = new StringBuilder();
            for (int i = 0; i < searchText.length(); i++) {
                char c = searchText.charAt(i);
                // Ignore URL-encoded chars
                if (c == '%') {
                    gid.append('_');
                    i += 2;
                    continue;
                // Ignore non-alphanumeric chars
                } else if (
                        (c >= 'a' && c <= 'z')
                     || (c >= 'A' && c <= 'Z') 
                     || (c >= '0' && c <= '9') 
                        ) {
                    gid.append(c);
                }
            }
            return "search_" + gid.toString();
        }
        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
    }


}
// Source code of com.rarchives.ripme.ripper.rippers.TwitterRipper

// Related classes of com.rarchives.ripme.ripper.rippers.TwitterRipper