Package bixo.fetcher

Source Code of bixo.fetcher.LoggingFetcher

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.fetcher;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.scaleunlimited.cascading.Payload;


import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.ContentBytes;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.ScoredUrlDatum;
import bixo.exceptions.BaseFetchException;
import bixo.exceptions.HttpFetchException;
import bixo.exceptions.UrlFetchException;

@SuppressWarnings("serial")
public class LoggingFetcher extends BaseFetcher {
    private static final Logger LOGGER = LoggerFactory.getLogger(LoggingFetcher.class);
   
    public static final String FAKE_CONTENT_LOCATION = "Fake-LoggingFetcher";
   
    // Generic HTML page we send back for every request - only customization is the URL
    private static final String HTML_TEMPLATE =
        "<!DOCTYPE HTML PUBLIC \"-//BBSW//DTD Compact HTML 2.0//EN\">\n" +
        "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n" +
        "<title>LoggingFetcher</title>\n" +
        "</head><body>URL = %s</body></html>\n";
   
    public LoggingFetcher(int maxThreads) {
        super(maxThreads, new FetcherPolicy(), new UserAgent("agentName", "agentName@domain.com", "http://agentName.domain.com"));
    }


    @Override
    public FetchedDatum get(ScoredUrlDatum datum) throws BaseFetchException {
        String url = datum.getUrl();
        Payload payload = datum.getPayload();
        logPayload(url, payload);
       
        // Create a simple HTML page here, where we fill in the URL as
        // the field, and return that as the BytesWritable. we could add
        // more of the datum values to the template if we cared.
        try {
            return makeFetchedDatum(url, String.format(HTML_TEMPLATE, url), payload);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("Should never happen", e);
        } catch (MalformedURLException e) {
            throw new UrlFetchException(url, e.getMessage());
        }
    }

    private FetchedDatum makeFetchedDatum(String url, String htmlContent, Payload payload) throws MalformedURLException, HttpFetchException, UnsupportedEncodingException {
        URL theUrl = new URL(url);
        if (theUrl.getFile().equals("/robots.txt")) {
            throw new HttpFetchException(url, "Never return robots.txt from LoggingFetcher", HttpStatus.SC_NOT_FOUND, null);
        }
       
        byte[] content = htmlContent.getBytes("UTF-8");
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_LENGTH, "" + content.length);
        headers.add(HttpHeaderNames.CONTENT_TYPE, "text/html");
       
        // Set the location to a fixed value, so that when we're processing entries from
        // the URL DB that might have been set using fake content, we know to ignore the
        // refetch time if we're doing a real fetch.
        headers.add(HttpHeaderNames.CONTENT_LOCATION, FAKE_CONTENT_LOCATION);
        FetchedDatum result = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), "text/html", 100000);
        result.setPayload(payload);
        return result;
    }


    private void logPayload(String url, Payload payload) {
        StringBuilder msg = new StringBuilder(url);
        msg.append(" ( ");
        for (String key : payload.keySet()) {
            msg.append(key);
            msg.append(':');
            Object value = payload.get(key);
            msg.append(value == null ? "null" : value.toString());
            msg.append(' ');
        }
        msg.append(")");

        LOGGER.info(msg.toString());
    }

    @Override
    public void abort() {
        // Do nothing
    }

}
TOP

Related Classes of bixo.fetcher.LoggingFetcher

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.