Package org.apache.nutch.protocol.sftp

Source Code of org.apache.nutch.protocol.sftp.Sftp

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol.sftp;

//JDK imports
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Map;
import java.util.Vector;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

//APACHE imports
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;

//JSCH imports
import com.jcraft.jsch.ChannelSftp;
import com.jcraft.jsch.JSch;
import com.jcraft.jsch.JSchException;
import com.jcraft.jsch.Session;
import com.jcraft.jsch.SftpException;
import com.jcraft.jsch.ChannelSftp.LsEntry;

/**
* This class uses the Jsch package to fetch content using the Sftp protocol.
*
*/
public class Sftp implements Protocol {

  private static final Logger logger = Logger.getLogger(Sftp.class);
  private static final Map<String, BlockingQueue<ChannelSftp>> channelSftpByHostMap = new Hashtable<String, BlockingQueue<ChannelSftp>>();

  private Configuration configuration;

  private String server;
  private int port;
  private String user;
  private String password;

  public Sftp() {
  }

  public ProtocolOutput getProtocolOutput(String url, WebPage page) {
    URL sUrl = null;
    String urlStr = url.toString().trim();

    ChannelSftp channelSftp = null;
    try {
      sUrl = new URL(urlStr);
      channelSftp = getChannelSftp(sUrl);

      String urlFile = sUrl.getFile();
      if (urlFile.endsWith(".htm") || urlFile.endsWith(".html")) {
        ProtocolOutput po = getFileProtocolOutput(sUrl, channelSftp,
            "text/html");
        return po;
      } else if (urlFile.endsWith(".pdf")) {
        ProtocolOutput po = getFileProtocolOutput(sUrl, channelSftp,
            "application/pdf");
        return po;
      } else {
        ProtocolOutput po = getDirectoryProtocolOutput(sUrl, channelSftp);
        return po;
      }
    } catch (MalformedURLException e) {
      logger.error("Bad URL String: " + urlStr, e);
      return null;
    } catch (InterruptedException e) {
      return null;
    } catch (SftpException e) {
      return null;
    } catch (IOException e) {
      return null;
    } catch (Exception e) {
      logger.error("Unknown Exception in getProtocolOutput()", e);
      return null;
    } finally {
      if (channelSftp != null) {
        try {
          putChannelSftp(sUrl, channelSftp);
        } catch (InterruptedException e) {
          logger.error("Cannot return ChannelSftp object to Queue", e);
        }
      }
    }
  }

  private ChannelSftp getChannelSftp(URL url) throws InterruptedException {
    String host = url.getHost();
    BlockingQueue<ChannelSftp> queue = channelSftpByHostMap.get(host);
    if (queue == null) {
      return null;
    }

    try {
      ChannelSftp cSftp = queue.take();
      return cSftp;
    } catch (InterruptedException e) {
      logger
          .error("Wait for getChannelSftp() interrupted for host: " + host, e);
      throw e;
    }
  }

  private void putChannelSftp(URL url, ChannelSftp cSftp)
      throws InterruptedException {
    String host = url.getHost();
    BlockingQueue<ChannelSftp> queue = channelSftpByHostMap.get(host);
    if (queue == null) {
      return;
    }

    try {
      queue.put(cSftp);
    } catch (InterruptedException e) {
      logger
          .error("Wait for putChannelSftp() interrupted for host: " + host, e);
      throw e;
    }
  }

  private ProtocolOutput getFileProtocolOutput(URL url,
      ChannelSftp channelSftp, String contentType) throws SftpException,
      IOException {
    byte[] bytes = null;
    InputStream iStream = null;
    try {
      int size = (int) channelSftp.lstat(url.getFile()).getSize();
      iStream = channelSftp.get(url.getFile());
      bytes = new byte[size];
      iStream.read(bytes);
    } catch (SftpException e) {
      logger.error("SftpException in getFileProtocolOutput(), file: "
          + url.getFile(), e);
      throw e;
    } catch (IOException e) {
      logger.error("IOException in getFileProtocolOutput(), file: "
          + url.getFile(), e);
      throw e;
    } finally {
      if (iStream != null) {
        iStream.close();
      }
    }

    String urlStr = url.toExternalForm();

    Metadata metadata = new Metadata();
    metadata.set(Response.CONTENT_TYPE, contentType);
    metadata.set(Response.CONTENT_LENGTH, String.valueOf(bytes.length));
    metadata.set(Response.LAST_MODIFIED, channelSftp.lstat(url.getFile())
        .getMtimeString());
    metadata.set(Response.LOCATION, urlStr);

    Content content = new Content(urlStr, urlStr, bytes, contentType, metadata,
        configuration);
    ProtocolOutput po = new ProtocolOutput(content);
    return po;
  }

  private ProtocolOutput getDirectoryProtocolOutput(URL url,
      ChannelSftp channelSftp) throws SftpException {
    try {
      channelSftp.cd(url.getFile());

      int count = 1;
      String directoryList = "<html><body>";
      Vector<LsEntry> vector = (Vector<LsEntry>) channelSftp.ls(".");
      for (LsEntry entry : vector) {
        String fileName = entry.getFilename();
        if (!fileName.equals(".") && !fileName.equals("..")) {
          directoryList += "<a href=\"" + url + "/" + fileName + "\">" + count
              + "</a>\r\n";
        }
        count++;
      }

      directoryList += "</body></html>";

      Metadata metadata = new Metadata();
      metadata.set(Response.CONTENT_TYPE, "text/html");
      metadata.set(Response.CONTENT_LENGTH, String.valueOf(directoryList
          .length()));
      metadata.set(Response.LAST_MODIFIED, channelSftp.lstat(url.getFile())
          .getMtimeString());
      metadata.set(Response.LOCATION, url.toExternalForm());

      Content content = new Content(url.toExternalForm(), url.toExternalForm(),
          directoryList.getBytes(), "text/html", metadata, configuration);
      ProtocolOutput po = new ProtocolOutput(content);
      return po;
    } catch (SftpException e) {
      logger.error("SftpException in getDirectoryProtocolOutput()", e);
      throw e;
    }
  }

  public Configuration getConf() {
    return configuration;
  }

  public void setConf(Configuration arg0) {
    configuration = arg0;

    server = configuration.get("sftp.server");
    port = configuration.getInt("sftp.port", 22);
    user = configuration.get("sftp.user", "anonymous");
    password = configuration.get("sftp.password", "guest");

    if (server == null) {
      return;
    }
   
    if (channelSftpByHostMap.containsKey(server)) {
      return;
    }

    synchronized (channelSftpByHostMap) {
      if (channelSftpByHostMap.containsKey(server)) {
        return;
      }

      JSch jsch = new JSch();
      Session session = null;
      try {
        session = jsch.getSession(user, server, port);
      } catch (JSchException e) {
        logger.error("Cannot create JSch session for user: " + user
            + ", host: " + server + ", port: " + port);
        return;
      }

      session.setPassword(password);
      Hashtable<String, String> config = new Hashtable<String, String>();
      config.put("StrictHostKeyChecking", "no");
      session.setConfig(config);

      ChannelSftp cSftp = null;
      try {
        session.connect(10000);

        cSftp = (ChannelSftp) session.openChannel("sftp");
        cSftp.connect();
      } catch (JSchException e) {
        logger.error("Cannot connect to JSch session for user: " + user
            + ", host: " + server + ", port: " + port);
        return;
      }

      BlockingQueue<ChannelSftp> queue = new ArrayBlockingQueue<ChannelSftp>(1,
          true);
      try {
        queue.put(cSftp);
      } catch (InterruptedException e) {
        logger.error("Interrupted during setConf()", e);
        return;
      }
      channelSftpByHostMap.put(server, queue);
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,
   * org.apache.nutch.storage.WebPage)
   */
  @Override
  public RobotRules getRobotRules(String url, WebPage page) {
    return new RobotRules() {

      @Override
      public boolean isAllowed(URL url) {
        // they're all allowed for now.
        return true;
      }

      @Override
      public long getExpireTime() {
        // set to 0 for never expire
        return 0;
      }

      @Override
      public long getCrawlDelay() {
        // no delay
        return 0;
      }
    };
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.nutch.plugin.FieldPluggable#getFields()
   */
  @Override
  public Collection<Field> getFields() {
    return Collections.emptySet();
  }

}
TOP

Related Classes of org.apache.nutch.protocol.sftp.Sftp

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.