Package org.apache.nutch.protocol.file

Source Code of org.apache.nutch.protocol.file.FileResponse

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol.file;

// JDK imports
import java.net.URL;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;

// Tika imports
import org.apache.tika.Tika;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

/************************************
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
*
* Comments: (1) java.net.URL and java.net.URLConnection can handle file:
* scheme. However they are not flexible enough, so not used in this
* implementation.
*
* (2) java.io.File is used for its abstractness across platforms. Warning:
* java.io.File API (1.4.2) does not elaborate on how special files, such as
* /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
* java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
* return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
* probably oaky for now. Could be buggy here. How about special files on
* windows?
*
* (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
* are just treated as individual files.
*
* (4) No funcy POSIX file attributes yet. May never need?
*
* @author John Xing
***********************************/
public class FileResponse {

  private String orig;
  private String base;
  private byte[] content;
  private static final byte[] EMPTY_CONTENT = new byte[0];
  private int code;
  private Metadata headers = new Metadata();

  private final File file;
  private Configuration conf;

  private MimeUtil MIME;
  private Tika tika;

  /** Returns the response code. */
  public int getCode() {
    return code;
  }

  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  public byte[] getContent() {
    return content;
  }

  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }

  /**
   * Default public constructor
   * @param url
   * @param datum
   * @param file
   * @param conf
   * @throws FileException
   * @throws IOException
   */
  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
    throws FileException, IOException {

    this.orig = url.toString();
    this.base = url.toString();
    this.file = file;
    this.conf = conf;
   
    MIME = new MimeUtil(conf);
    tika = new Tika();

    if (!"file".equals(url.getProtocol()))
      throw new FileException("Not a file url:" + url);

    if (File.LOG.isTraceEnabled()) {
      File.LOG.trace("fetching " + url);
    }

    if (url.getPath() != url.getFile()) {
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }

    String path = "".equals(url.getPath()) ? "/" : url.getPath();

    try {
      // specify the encoding via the config later?
      path = java.net.URLDecoder.decode(path, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
    }

    try {

      this.content = null;

      // url.toURI() is only in j2se 1.5.0
      //java.io.File f = new java.io.File(url.toURI());
      java.io.File f = new java.io.File(path);

      if (!f.exists()) {
        this.code = 404// http Not Found
        return;
      }

      if (!f.canRead()) {
        this.code = 401// http Unauthorized
        return;
      }

      // symbolic link or relative path on unix
      // fix me: what's the consequence on windows platform
      // where case is insensitive
      if (!f.equals(f.getCanonicalFile())) {
        // set headers
        //hdrs.put("Location", f.getCanonicalFile().toURI());
        //
        // we want to automatically escape characters that are illegal in URLs.
        // It is recommended that new code convert an abstract pathname into a URL
        // by first converting it into a URI, via the toURI method, and then
        // converting the URI into a URL via the URI.toURL method.
        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

        this.code = 300// http redirect
        return;
      }
      if (f.lastModified() <= datum.getModifiedTime()) {
        this.code = 304;
        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
        return;
      }

      if (f.isDirectory()) {
        getDirAsHttpResponse(f);
      } else if (f.isFile()) {
        getFileAsHttpResponse(f);
      } else {
        this.code = 500; // http Internal Server Error
        return;
      }

    } catch (IOException e) {
      throw e;
    }

  }

  // get file as http response
  private void getFileAsHttpResponse(java.io.File f) throws FileException,
      IOException {

    // ignore file of size larger than
    // Integer.MAX_VALUE = 2^31-1 = 2147483647
    long size = f.length();
    if (size > Integer.MAX_VALUE) {
      throw new FileException("file is too large, size: " + size);
      // or we can do this?
      // this.code = 400; // http Bad request
      // return;
    }

    // capture content
    int len = (int) size;

    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
      len = this.file.maxContentLength;

    this.content = new byte[len];

    java.io.InputStream is = new java.io.FileInputStream(f);
    int offset = 0;
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
    if (offset < len) { // keep whatever already have, but issue a warning
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }
    is.close();

    // set headers
    headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    String mimeType = tika.detect(f);

    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

    // response code
    this.code = 200; // http OK
  }

  /**
   * get dir list as http response
   * @param f
   * @throws IOException
   */
  private void getDirAsHttpResponse(java.io.File f) throws IOException {

    String path = f.toString();
    if (this.file.crawlParents)
      this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
          : true);
    else
      this.content = list2html(f.listFiles(), path, false);

    // set headers
    headers.set(Response.CONTENT_LENGTH,
        new Integer(this.content.length).toString());
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    // response code
    this.code = 200; // http OK
  }

  /**
   * generate html page from dir list
   * @param list
   * @param path
   * @param includeDotDot
   * @return
   */
  private byte[] list2html(java.io.File[] list, String path,
      boolean includeDotDot) {

    StringBuffer x = new StringBuffer("<html><head>");
    x.append("<title>Index of " + path + "</title></head>\n");
    x.append("<body><h1>Index of " + path + "</h1><pre>\n");

    if (includeDotDot) {
      x.append("<a href='../'>../</a>\t-\t-\t-\n");
    }

    // fix me: we might want to sort list here! but not now.

    java.io.File f;
    for (int i = 0; i < list.length; i++) {
      f = list[i];
      String name = f.getName();
      String time = HttpDateFormat.toString(f.lastModified());
      if (f.isDirectory()) {
        // java 1.4.2 api says dir itself and parent dir are not listed
        // so the following is not needed.
        // if (name.equals(".") || name.equals(".."))
        // continue;
        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
        x.append(time + "\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='" + name + "'>" + name + "</a>\t");
        x.append(time + "\t" + f.length() + "\n");
      } else {
        // ignore any other
      }
    }

    x.append("</pre></body></html>\n");

    return new String(x).getBytes();
  }

}
TOP

Related Classes of org.apache.nutch.protocol.file.FileResponse

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.