Package org.apache.nutch.protocol.httpclient

Source Code of org.apache.nutch.protocol.httpclient.Http

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.httpclient;

// JDK imports
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// HTTP Client imports
import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NTCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;

// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;


public class Http extends HttpBase {

  public static final Log LOG = LogFactory.getLog(Http.class);

  private static MultiThreadedHttpConnectionManager connectionManager =
          new MultiThreadedHttpConnectionManager();

  // Since the Configuration has not yet been setted,
  // then an unconfigured client is returned.
  private static HttpClient client = new HttpClient(connectionManager);

  static synchronized HttpClient getClient() {
    return client;
  }

  boolean verbose = false;
  int maxThreadsTotal = 10;
  String ntlmUsername = "";
  String ntlmPassword = "";
  String ntlmDomain = "";
  String ntlmHost = "";

  public Http() {
    super(LOG);
  }

  public void setConf(Configuration conf) {
    super.setConf(conf);
    this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
    this.ntlmUsername = conf.get("http.auth.ntlm.username", "");
    this.ntlmPassword = conf.get("http.auth.ntlm.password", "");
    this.ntlmDomain = conf.get("http.auth.ntlm.domain", "");
    this.ntlmHost = conf.get("http.auth.ntlm.host", "");
    //Level logLevel = Level.WARNING;
    //if (conf.getBoolean("http.verbose", false)) {
    //  logLevel = Level.FINE;
    //}
    //LOG.setLevel(logLevel);
    //Logger.getLogger("org.apache.commons.httpclient.HttpMethodDirector")
    //      .setLevel(logLevel);
    configureClient();
  }

  public static void main(String[] args) throws Exception {
    Http http = new Http();
    http.setConf(NutchConfiguration.create());
    main(http, args);
  }

  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
    throws ProtocolException, IOException {
    return new HttpResponse(this, url, datum, redirect);
  }
 
  private void configureClient() {

    // Set up an HTTPS socket factory that accepts self-signed certs.
    Protocol dummyhttps = new Protocol("https", new DummySSLProtocolSocketFactory(), 443);
    Protocol.registerProtocol("https", dummyhttps);
   
    HttpConnectionManagerParams params = connectionManager.getParams();
    params.setConnectionTimeout(timeout);
    params.setSoTimeout(timeout);
    params.setSendBufferSize(BUFFER_SIZE);
    params.setReceiveBufferSize(BUFFER_SIZE);
    params.setMaxTotalConnections(maxThreadsTotal);
    if (maxThreadsTotal > maxThreadsPerHost) {
      params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost);
    } else {
      params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
    }

    HostConfiguration hostConf = client.getHostConfiguration();
    ArrayList headers = new ArrayList();
    // prefer English
    headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
    // prefer UTF-8
    headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
    // prefer understandable formats
    headers.add(new Header("Accept",
            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
    // accept gzipped content
    headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
    hostConf.getParams().setParameter("http.default-headers", headers);
    if (useProxy) {
      hostConf.setProxy(proxyHost, proxyPort);
    }
    if (ntlmUsername.length() > 0) {
      Credentials ntCreds = new NTCredentials(ntlmUsername, ntlmPassword, ntlmHost, ntlmDomain);
      client.getState().setCredentials(new AuthScope(ntlmHost, AuthScope.ANY_PORT), ntCreds);

      if (LOG.isInfoEnabled()) {
        LOG.info("Added NTLM credentials for " + ntlmUsername);
      }
    }
    if (LOG.isInfoEnabled()) { LOG.info("Configured Client"); }
  }
}
TOP

Related Classes of org.apache.nutch.protocol.httpclient.Http

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.