Source Code of org.apache.nutch.protocol.file.FileResponse

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.protocol.file;


// JDK imports
import java.net.URL;
import java.io.IOException;
import java.io.UnsupportedEncodingException;


// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;


// Tika imports
import org.apache.tika.Tika;


// Hadoop imports
import org.apache.hadoop.conf.Configuration;


/************************************
 * FileResponse.java mimics file replies as http response. It tries its best to
 * follow http's way for headers, response codes as well as exceptions.
 * 
 * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
 * scheme. However they are not flexible enough, so not used in this
 * implementation.
 * 
 * (2) java.io.File is used for its abstractness across platforms. Warning:
 * java.io.File API (1.4.2) does not elaborate on how special files, such as
 * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
 * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
 * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
 * probably oaky for now. Could be buggy here. How about special files on
 * windows?
 * 
 * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
 * are just treated as individual files.
 * 
 * (4) No funcy POSIX file attributes yet. May never need?
 * 
 * @author John Xing
 ***********************************/
public class FileResponse {


  private String orig;
  private String base;
  private byte[] content;
  private static final byte[] EMPTY_CONTENT = new byte[0];
  private int code;
  private Metadata headers = new Metadata();


  private final File file;
  private Configuration conf;


  private MimeUtil MIME;
  private Tika tika;


  /** Returns the response code. */
  public int getCode() {
    return code;
  }


  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }


  public byte[] getContent() {
    return content;
  }


  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }


  /**
   * Default public constructor
   * @param url
   * @param datum
   * @param file
   * @param conf
   * @throws FileException
   * @throws IOException
   */
  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
    throws FileException, IOException {


    this.orig = url.toString();
    this.base = url.toString();
    this.file = file;
    this.conf = conf;
    
    MIME = new MimeUtil(conf);
    tika = new Tika();


    if (!"file".equals(url.getProtocol()))
      throw new FileException("Not a file url:" + url);


    if (File.LOG.isTraceEnabled()) {
      File.LOG.trace("fetching " + url);
    }


    if (url.getPath() != url.getFile()) {
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }


    String path = "".equals(url.getPath()) ? "/" : url.getPath();


    try {
      // specify the encoding via the config later?
      path = java.net.URLDecoder.decode(path, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
    }


    try {


      this.content = null;


      // url.toURI() is only in j2se 1.5.0
      //java.io.File f = new java.io.File(url.toURI());
      java.io.File f = new java.io.File(path);


      if (!f.exists()) {
        this.code = 404;  // http Not Found
        return;
      }


      if (!f.canRead()) {
        this.code = 401;  // http Unauthorized
        return;
      }


      // symbolic link or relative path on unix
      // fix me: what's the consequence on windows platform
      // where case is insensitive
      if (!f.equals(f.getCanonicalFile())) {
        // set headers
        //hdrs.put("Location", f.getCanonicalFile().toURI());
        //
        // we want to automatically escape characters that are illegal in URLs. 
        // It is recommended that new code convert an abstract pathname into a URL 
        // by first converting it into a URI, via the toURI method, and then 
        // converting the URI into a URL via the URI.toURL method.
        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());


        this.code = 300;  // http redirect
        return;
      }
      if (f.lastModified() <= datum.getModifiedTime()) {
        this.code = 304;
        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
        return;
      }


      if (f.isDirectory()) {
        getDirAsHttpResponse(f);
      } else if (f.isFile()) {
        getFileAsHttpResponse(f);
      } else {
        this.code = 500; // http Internal Server Error
        return;
      }


    } catch (IOException e) {
      throw e;
    }


  }


  // get file as http response
  private void getFileAsHttpResponse(java.io.File f) throws FileException,
      IOException {


    // ignore file of size larger than
    // Integer.MAX_VALUE = 2^31-1 = 2147483647
    long size = f.length();
    if (size > Integer.MAX_VALUE) {
      throw new FileException("file is too large, size: " + size);
      // or we can do this?
      // this.code = 400; // http Bad request
      // return;
    }


    // capture content
    int len = (int) size;


    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
      len = this.file.maxContentLength;


    this.content = new byte[len];


    java.io.InputStream is = new java.io.FileInputStream(f);
    int offset = 0;
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
    if (offset < len) { // keep whatever already have, but issue a warning
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }
    is.close();


    // set headers
    headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));


    String mimeType = tika.detect(f);


    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");


    // response code
    this.code = 200; // http OK
  }


  /**
   * get dir list as http response
   * @param f
   * @throws IOException
   */
  private void getDirAsHttpResponse(java.io.File f) throws IOException {


    String path = f.toString();
    if (this.file.crawlParents)
      this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
          : true);
    else
      this.content = list2html(f.listFiles(), path, false);


    // set headers
    headers.set(Response.CONTENT_LENGTH,
        new Integer(this.content.length).toString());
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));


    // response code
    this.code = 200; // http OK
  }


  /**
   * generate html page from dir list
   * @param list
   * @param path
   * @param includeDotDot
   * @return
   */
  private byte[] list2html(java.io.File[] list, String path,
      boolean includeDotDot) {


    StringBuffer x = new StringBuffer("<html><head>");
    x.append("<title>Index of " + path + "</title></head>\n");
    x.append("<body><h1>Index of " + path + "</h1><pre>\n");


    if (includeDotDot) {
      x.append("<a href='../'>../</a>\t-\t-\t-\n");
    }


    // fix me: we might want to sort list here! but not now.


    java.io.File f;
    for (int i = 0; i < list.length; i++) {
      f = list[i];
      String name = f.getName();
      String time = HttpDateFormat.toString(f.lastModified());
      if (f.isDirectory()) {
        // java 1.4.2 api says dir itself and parent dir are not listed
        // so the following is not needed.
        // if (name.equals(".") || name.equals(".."))
        // continue;
        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
        x.append(time + "\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='" + name + "'>" + name + "</a>\t");
        x.append(time + "\t" + f.length() + "\n");
      } else {
        // ignore any other
      }
    }


    x.append("</pre></body></html>\n");


    return new String(x).getBytes();
  }


}
Source Code of org.apache.nutch.protocol.file.FileResponse

Related Classes of org.apache.nutch.protocol.file.FileResponse