Package org.apache.nutch.protocol.ftp

Source Code of org.apache.nutch.protocol.ftp.FtpResponse

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol.ftp;

import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPReply;
import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
import org.apache.commons.net.ftp.parser.ParserInitializationException;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;

import java.net.InetAddress;
import java.net.URL;
import java.util.List;
import java.util.LinkedList;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

/**
* FtpResponse.java mimics ftp replies as http response.
* It tries its best to follow http's way for headers, response codes
* as well as exceptions.
*
* Comments:
* In this class, all FtpException*.java thrown by Client.java
* and some important commons-net exceptions passed by Client.java
* must have been properly dealt with. They'd better not be leaked
* to the caller of this class.
*/
public class FtpResponse {

  private String orig;
  private String base;
  private byte[] content;
  private static final byte[] EMPTY_CONTENT = new byte[0];
  private int code;
  private Metadata headers = new Metadata();

  private final Ftp ftp;
  private Configuration conf;

  /** Returns the response code. */
  public int getCode() { return code; }

  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  public byte[] getContent() { return content; }

  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
                       getHeader(Response.CONTENT_TYPE),
                       headers, this.conf);
  }

  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
    throws FtpException, IOException {

    this.orig = url.toString();
    this.base = url.toString();
    this.ftp = ftp;
    this.conf = conf;

    if (!"ftp".equals(url.getProtocol()))
      throw new FtpException("Not a ftp url:" + url);

    if (url.getPath() != url.getFile()) {
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }

    String path = "".equals(url.getPath()) ? "/" : url.getPath();

    try {

      if (ftp.followTalk) {
        if (Ftp.LOG.isInfoEnabled()) {
          Ftp.LOG.info("fetching "+url);
        }
      } else {
        if (Ftp.LOG.isTraceEnabled()) {
          Ftp.LOG.trace("fetching "+url);
        }
      }

      InetAddress addr = InetAddress.getByName(url.getHost());
      if (addr != null
          && conf.getBoolean("store.ip.address", false) == true) {
        headers.add("_ip_", addr.getHostAddress());
      }

      // idled too long, remote server or ourselves may have timed out,
      // should start anew.
      if (ftp.client != null && ftp.keepConnection
          && ftp.renewalTime < System.currentTimeMillis()) {
        if (Ftp.LOG.isInfoEnabled()) {
          Ftp.LOG.info("delete client because idled too long");
        }
        ftp.client = null;
      }

      // start anew if needed
      if (ftp.client == null) {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("start client");
        }
        // the real client
        ftp.client = new Client();
        // when to renew, take the lesser
        //ftp.renewalTime = System.currentTimeMillis()
        //  + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);

        // timeout for control connection
        ftp.client.setDefaultTimeout(ftp.timeout);
        // timeout for data connection
        ftp.client.setDataTimeout(ftp.timeout);

        // follow ftp talk?
        if (ftp.followTalk)
          ftp.client.addProtocolCommandListener(
            new PrintCommandListener(Ftp.LOG));
      }

      // quit from previous site if at a different site now
      if (ftp.client.isConnected()) {
        InetAddress remoteAddress = ftp.client.getRemoteAddress();
        if (!addr.equals(remoteAddress)) {
          if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
            Ftp.LOG.info("disconnect from "+remoteAddress
            +" before connect to "+addr);
          }
          // quit from current site
          ftp.client.logout();
          ftp.client.disconnect();
        }
      }

      // connect to current site if needed
      if (!ftp.client.isConnected()) {

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("connect to "+addr);
        }

        ftp.client.connect(addr);
        if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.connect() failed: "
              + addr + " " + ftp.client.getReplyString());
          }
          this.code = 500; // http Internal Server Error
          return;
        }

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("log into "+addr);
        }

        if (!ftp.client.login(ftp.userName, ftp.passWord)) {
          // login failed.
          // please note that some server may return 421 immediately
          // after USER anonymous, thus ftp.client.login() won't return false,
          // but throw exception, which then will be handled by caller
          // (not dealt with here at all) .
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.login() failed: "+addr);
          }
          this.code = 401// http Unauthorized
          return;
        }

        // insist on binary file type
        if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
          ftp.client.logout();
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.setFileType() failed: "+addr);
          }
          this.code = 500; // http Internal Server Error
          return;
        }

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("set parser for "+addr);
        }

        // SYST is valid only after login
        try {
          ftp.parser = null;
          String parserKey = ftp.client.getSystemName();
          // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
          if (parserKey.startsWith("UNKNOWN Type: L8"))
            parserKey = "UNIX Type: L8";
          ftp.parser = (new DefaultFTPFileEntryParserFactory())
            .createFileEntryParser(parserKey);
        } catch (FtpExceptionBadSystResponse e) {
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.getSystemName() failed: "+addr+" "+e);
          }
          ftp.parser = null;
        } catch (ParserInitializationException e) {
          // ParserInitializationException is RuntimeException defined in
          // org.apache.commons.net.ftp.parser.ParserInitializationException
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("createFileEntryParser() failed. "+addr+" "+e);
          }
          ftp.parser = null;
        } finally {
          if (ftp.parser == null) {
            // do not log as severe, otherwise
            // FetcherThread/RequestScheduler will abort
            if (Ftp.LOG.isWarnEnabled()) {
              Ftp.LOG.warn("ftp.parser is null: "+addr);
            }
            ftp.client.logout();
            ftp.client.disconnect();
            this.code = 500; // http Internal Server Error
            return;
          }
        }

      } else {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("use existing connection");
        }
      }

      this.content = null;

      if (path.endsWith("/")) {
        getDirAsHttpResponse(path, datum.getModifiedTime());
      } else {
        getFileAsHttpResponse(path, datum.getModifiedTime());
      }

      // reset next renewalTime, take the lesser
      if (ftp.client != null && ftp.keepConnection) {
        ftp.renewalTime = System.currentTimeMillis()
          + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("reset renewalTime to "
            + HttpDateFormat.toString(ftp.renewalTime));
        }
      }

      // getDirAsHttpResponse() or getFileAsHttpResponse() above
      // may have deleted ftp.client
      if (ftp.client != null && !ftp.keepConnection) {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("disconnect from "+addr);
        }
        ftp.client.logout();
        ftp.client.disconnect();
      }
     
    } catch (Exception e) {
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("Error: ", e);
      }
      // for any un-foreseen exception (run time exception or not),
      // do ultimate clean and leave ftp.client for garbage collection
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client due to exception");
      }
      ftp.client = null;
      // or do explicit garbage collection?
      // System.gc();
// can we be less dramatic, using the following instead?
// probably unnecessary for our practical purpose here
//      try {
//        ftp.client.logout();
//        ftp.client.disconnect();
//      }
      throw new FtpException(e);
      //throw e;
    }

  }

  // get ftp file as http response
  private void getFileAsHttpResponse(String path, long lastModified)
    throws IOException {

    ByteArrayOutputStream os = null;
    List<FTPFile> list = null;

    try {
      // first get its possible attributes
      list = new LinkedList<FTPFile>();
      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);

      FTPFile ftpFile = (FTPFile) list.get(0);
      this.headers.set(Response.CONTENT_LENGTH,
                       new Long(ftpFile.getSize()).toString());
      this.headers.set(Response.LAST_MODIFIED,
                       HttpDateFormat.toString(ftpFile.getTimestamp()));
      // don't retrieve the file if not changed.
      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
        code = 304;
        return;
      }
      os = new ByteArrayOutputStream(ftp.getBufferSize());
      ftp.client.retrieveFile(path, os, ftp.maxContentLength);

      this.content = os.toByteArray();

//      // approximate bytes sent and read
//      if (this.httpAccounting != null) {
//        this.httpAccounting.incrementBytesSent(path.length());
//        this.httpAccounting.incrementBytesRead(this.content.length);
//      }

      this.code = 200; // http OK

    } catch (FtpExceptionControlClosedByForcedDataClose e) {

      // control connection is off, clean up
      // ftp.client.disconnect();
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client because server cut off control channel: "+e);
      }
      ftp.client = null;

      // in case this FtpExceptionControlClosedByForcedDataClose is
      // thrown by retrieveList() (not retrieveFile()) above,
      if (os == null) { // indicating throwing by retrieveList()
        //throw new FtpException("fail to get attibutes: "+path);
        if (Ftp.LOG.isWarnEnabled()) {
          Ftp.LOG.warn(
              "Please try larger maxContentLength for ftp.client.retrieveList(). "
            + e);
        }
        // in a way, this is our request fault
        this.code = 400// http Bad request
        return;
      }

      FTPFile ftpFile = (FTPFile) list.get(0);
      this.headers.set(Response.CONTENT_LENGTH,
                       new Long(ftpFile.getSize()).toString());
      //this.headers.put("content-type", "text/html");
      this.headers.set(Response.LAST_MODIFIED,
                      HttpDateFormat.toString(ftpFile.getTimestamp()));
      this.content = os.toByteArray();
      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
        code = 304;
        return;
      }

//      // approximate bytes sent and read
//      if (this.httpAccounting != null) {
//        this.httpAccounting.incrementBytesSent(path.length());
//        this.httpAccounting.incrementBytesRead(this.content.length);
//      }

      this.code = 200; // http OK

    } catch (FtpExceptionCanNotHaveDataConnection e) {

      if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
      // it is not a file, but dir, so redirect as a dir
        this.headers.set(Response.LOCATION, path + "/");
        this.code = 300// http redirect
        // fixme, should we do ftp.client.cwd("/"), back to top dir?
      } else {
      // it is not a dir either
        this.code = 404// http Not Found
      }

    } catch (FtpExceptionUnknownForcedDataClose e) {
      // Please note control channel is still live.
      // in a way, this is our request fault
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn(
            "Unrecognized reply after forced close of data channel. "
          + "If this is acceptable, please modify Client.java accordingly. "
          + e);
      }
      this.code = 400; // http Bad Request
    }

  }

  // get ftp dir list as http response
  private void getDirAsHttpResponse(String path, long lastModified)
    throws IOException {
    List<FTPFile> list = new LinkedList<FTPFile>();

    try {

      // change to that dir first
      if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
        this.code = 404// http Not Found
        return;
      }

      // fixme, should we do ftp.client.cwd("/"), back to top dir?

      ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
      this.content = list2html(list, path, "/".equals(path) ? false : true);
      this.headers.set(Response.CONTENT_LENGTH,
                       new Integer(this.content.length).toString());
      this.headers.set(Response.CONTENT_TYPE, "text/html");
      // this.headers.put("Last-Modified", null);

//      // approximate bytes sent and read
//      if (this.httpAccounting != null) {
//        this.httpAccounting.incrementBytesSent(path.length());
//        this.httpAccounting.incrementBytesRead(this.content.length);
//      }

      this.code = 200; // http OK

    } catch (FtpExceptionControlClosedByForcedDataClose e) {

      // control connection is off, clean up
      // ftp.client.disconnect();
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client because server cut off control channel: "+e);
      }
      ftp.client = null;

      this.content = list2html(list, path, "/".equals(path) ? false : true);
      this.headers.set(Response.CONTENT_LENGTH,
                       new Integer(this.content.length).toString());
      this.headers.set(Response.CONTENT_TYPE, "text/html");
      // this.headers.put("Last-Modified", null);

//      // approximate bytes sent and read
//      if (this.httpAccounting != null) {
//        this.httpAccounting.incrementBytesSent(path.length());
//        this.httpAccounting.incrementBytesRead(this.content.length);
//      }

      this.code = 200; // http OK

    } catch (FtpExceptionUnknownForcedDataClose e) {
      // Please note control channel is still live.
      // in a way, this is our request fault
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn(
            "Unrecognized reply after forced close of data channel. "
          + "If this is acceptable, please modify Client.java accordingly. "
          + e);
      }
      this.code = 400; // http Bad Request
    } catch (FtpExceptionCanNotHaveDataConnection e) {
      if (Ftp.LOG.isWarnEnabled()) { Ftp.LOG.warn(""+ e); }
      this.code = 500; // http Iternal Server Error
    }

  }

  // generate html page from ftp dir list
  private byte[] list2html(List<FTPFile> list, String path, boolean includeDotDot) {

    //StringBuffer x = new StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
    StringBuffer x = new StringBuffer("<html><head>");
    x.append("<title>Index of "+path+"</title></head>\n");
    x.append("<body><h1>Index of "+path+"</h1><pre>\n");

    if (includeDotDot) {
      x.append("<a href='../'>../</a>\t-\t-\t-\n");
    }

    for (int i=0; i<list.size(); i++) {
      FTPFile f = (FTPFile) list.get(i);
      String name = f.getName();
      String time = HttpDateFormat.toString(f.getTimestamp());
      if (f.isDirectory()) {
        // some ftp server LIST "." and "..", we skip them here
        if (name.equals(".") || name.equals(".."))
          continue;
        x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
        x.append(time+"\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='"+name+    "'>"+name+"</a>\t");
        x.append(time+"\t"+f.getSize()+"\n");
      } else {
        // ignore isSymbolicLink()
        // ignore isUnknown()
      }
    }

    x.append("</pre></body></html>\n");

    return new String(x).getBytes();
  }

}
TOP

Related Classes of org.apache.nutch.protocol.ftp.FtpResponse

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.