Package com.codeforces.graygoose.util

Source Code of com.codeforces.graygoose.util.UrlUtil

package com.codeforces.graygoose.util;

import com.codeforces.graygoose.model.Response;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClients;
import org.apache.http.message.BasicHeader;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

public class UrlUtil {
    private static final Logger logger = Logger.getLogger(UrlUtil.class);

    private static final Header GRAYGOOSE_AGENT_HTTP_HEADER = new BasicHeader("X-User-Agent", "Graygoose");

    private static final String CONTENT_TYPE_HTTP_HEADER = "content-type";
    private static final String CHARSET_EQ = "charset=";
    private static final int CHARSET_EQ_LENGTH = CHARSET_EQ.length();

    private static final Pattern HTML_SPLIT_PATTERN = Pattern.compile("[<>]");
    private static final Pattern TRANSFORM_CHARSET_STRING_PATTERN = Pattern.compile("[\\r\\n\\s]+");

    public static Response fetchUrl(String urlString, int attemptCount) {
        return fetchUrls(Arrays.asList(urlString), attemptCount).get(0);
    }

    public static List<Response> fetchUrls(List<String> urlStrings, int attemptCount) {
        if (attemptCount < 1) {
            throw new IllegalArgumentException("Argument \'attemptCount\' can't be less than \'1\'.");
        }

        CloseableHttpAsyncClient client = HttpAsyncClients.createDefault();
        client.start();


        try {
            int urlStringCount = urlStrings.size();
            String[] urlStringsInternal = new String[urlStrings.size()];
            urlStrings.toArray(urlStringsInternal);

            Response[] responses = new Response[urlStringCount];
            @SuppressWarnings("unchecked") Future<HttpResponse>[] futureResponses = new Future[urlStringCount];

            for (int attemptIndex = 0; attemptIndex < attemptCount; ++attemptIndex) {
                fetchUrlsInternal(client, urlStringCount, urlStringsInternal, responses, futureResponses);
            }

            return Arrays.asList(responses);
        } finally {
            try {
                client.close();
            } catch (IOException e) {
                // No operations.
            }
        }
    }

    @SuppressWarnings({"OverlyLongMethod"})
    private static void fetchUrlsInternal(CloseableHttpAsyncClient client, int urlStringCount,
                                          String[] urlStringsInternal, Response[] responses, Future<HttpResponse>[] futureResponses) {
        for (int i = 0; i < urlStringCount; ++i) {
            if (responses[i] != null && responses[i].getCode() != -1) {
                continue;
            }

            String urlString = urlStringsInternal[i];
            logger.info("Start to fetch URL [" + urlString + "] ...");

            try {
                HttpGet request = new HttpGet(urlString);
                RequestConfig requestConfig = RequestConfig.custom()
                        .setConnectTimeout(10 * 1000)
                        .setConnectionRequestTimeout(20 * 1000)
                        .setSocketTimeout(30 * 1000)
                        .build();
                request.setConfig(requestConfig);

                request.setHeader(GRAYGOOSE_AGENT_HTTP_HEADER);

                futureResponses[i] = client.execute(request, null);
            } catch (RuntimeException e) {
                logger.error("Fetch error: " + e.getMessage() + '.');
                futureResponses[i] = null;
                responses[i] = Response.newResponse(urlString, -1, e.getMessage());
            }
        }

        for (int i = 0; i < urlStringCount; ++i) {
            if (responses[i] != null && responses[i].getCode() != -1) {
                continue;
            }

            Future<HttpResponse> futureResponse = futureResponses[i];

            if (futureResponse != null) {
                String urlString = urlStringsInternal[i];

                try {
                    HttpResponse httpResponse = futureResponse.get();
                    logger.info("URL [" + urlString + "] has been successfully fetched.");

                    int responseCode = httpResponse.getStatusLine().getStatusCode();
                    Charset responseCharset = getHttpResponseCharset(httpResponse);
                    System.out.println("Headers charset: " + responseCharset);

                    byte[] responseContent = IOUtils.toByteArray(httpResponse.getEntity().getContent());

                    String responseText = responseCharset == null ?
                            new String(responseContent) :
                            new String(responseContent, responseCharset);

                    if (responseCharset == null) {
                        responseCharset = getCharsetByHtml(responseText);
                        System.out.println("Html charset: " + responseCharset);
                        if (responseCharset != null) {
                            responseText = new String(responseContent, responseCharset);
                        }
                    }

                    responses[i] = Response.newResponse(urlString, responseCode, responseText);
                } catch (Exception e) {
                    logger.error("Fetch error: " + e.getMessage() + '.');
                    responses[i] = Response.newResponse(urlString, -1, e.getMessage());
                }
            }
        }
    }

    private static Charset getHttpResponseCharset(HttpResponse httpResponse) {
        Header[] headers = httpResponse.getAllHeaders();

        for (Header header : headers) {
            String headerName = header.getName();
            String headerValue = header.getValue();

            if (CONTENT_TYPE_HTTP_HEADER.equalsIgnoreCase(headerName)) {
                return extractCharset(headerValue);
            }
        }

        return null;
    }

    private static Charset extractCharset(String s) {
        s = TRANSFORM_CHARSET_STRING_PATTERN.matcher(s).replaceAll("");
        int sLength = s.length();
        int charsetEqPos = s.indexOf(CHARSET_EQ);

        if (charsetEqPos >= 0) {
            int charsetNamePos = charsetEqPos + CHARSET_EQ_LENGTH;

            int endPos = charsetNamePos;
            while (endPos < sLength && isCharsetCharacter(s.charAt(endPos))) {
                ++endPos;
            }

            String charsetName = (endPos == -1 ?
                    s.substring(charsetNamePos) :
                    s.substring(charsetNamePos, endPos)).trim();

            return Charset.isSupported(charsetName) ? Charset.forName(charsetName) : null;
        } else {
            return null;
        }
    }

    /**
     * @param c Character to analyze.
     * @return {@code true} iff character is a part of charset or whitespace.
     */
    private static boolean isCharsetCharacter(char c) {
        return Character.isWhitespace(c)
                || Character.isLetterOrDigit(c)
                || c == '-';
    }

    private static Charset getCharsetByHtml(String html) {
        html = html.toLowerCase();
        int headEnd = html.indexOf("</head>");

        if (headEnd > 0) {
            String head = html.substring(0, headEnd);
            String[] tokens = HTML_SPLIT_PATTERN.split(head);
            for (String token : tokens) {
                if (token.contains(CONTENT_TYPE_HTTP_HEADER)) {
                    return extractCharset(token);
                }
            }
        }

        return null;
    }

    private UrlUtil() {
    }
}
TOP

Related Classes of com.codeforces.graygoose.util.UrlUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.