Package org.apache.nutch.tools.proxy

Source Code of org.apache.nutch.tools.proxy.SegmentHandler

package org.apache.nutch.tools.proxy;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletResponse;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.mortbay.jetty.Request;

/**
* XXX should turn this into a plugin?
*/
public class SegmentHandler extends AbstractTestbedHandler {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentHandler.class);
  private Segment seg;
 
  private static HashMap<Integer,Integer> protoCodes = new HashMap<Integer,Integer>();
 
  static {
    protoCodes.put(ProtocolStatus.ACCESS_DENIED, HttpServletResponse.SC_UNAUTHORIZED);
    protoCodes.put(ProtocolStatus.BLOCKED, HttpServletResponse.SC_SERVICE_UNAVAILABLE);
    protoCodes.put(ProtocolStatus.EXCEPTION, HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
    protoCodes.put(ProtocolStatus.FAILED, HttpServletResponse.SC_BAD_REQUEST);
    protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
    protoCodes.put(ProtocolStatus.MOVED, HttpServletResponse.SC_MOVED_PERMANENTLY);
    protoCodes.put(ProtocolStatus.NOTFETCHING, HttpServletResponse.SC_BAD_REQUEST);
    protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
    protoCodes.put(ProtocolStatus.NOTMODIFIED, HttpServletResponse.SC_NOT_MODIFIED);
    protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND, HttpServletResponse.SC_BAD_REQUEST);
    protoCodes.put(ProtocolStatus.REDIR_EXCEEDED, HttpServletResponse.SC_BAD_REQUEST);
    protoCodes.put(ProtocolStatus.RETRY, HttpServletResponse.SC_BAD_REQUEST);
    protoCodes.put(ProtocolStatus.ROBOTS_DENIED, HttpServletResponse.SC_FORBIDDEN);
    protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
    protoCodes.put(ProtocolStatus.TEMP_MOVED, HttpServletResponse.SC_MOVED_TEMPORARILY);
    protoCodes.put(ProtocolStatus.WOULDBLOCK, HttpServletResponse.SC_BAD_REQUEST);
  }
 
  private static class SegmentPathFilter implements PathFilter {
    public static final SegmentPathFilter INSTANCE = new SegmentPathFilter();
   
    @Override
    public boolean accept(Path p) {
      return p.getName().startsWith("part-");
    }
   
  }
 
  private static class Segment implements Closeable {
   
    private static final Partitioner<Text,Writable> PARTITIONER = new HashPartitioner<Text,Writable>();

    private Path segmentDir;

    private Object cLock = new Object();
    private Object crawlLock = new Object();
    private MapFile.Reader[] content;
    private MapFile.Reader[] parseText;
    private MapFile.Reader[] parseData;
    private MapFile.Reader[] crawl;
    private Configuration conf;

    public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
      this.segmentDir = segmentDir;
      this.conf = conf;
    }

    public CrawlDatum getCrawlDatum(Text url) throws IOException {
      synchronized (crawlLock) {
        if (crawl == null)
          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
      }
      return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
    }
   
    public Content getContent(Text url) throws IOException {
      synchronized (cLock) {
        if (content == null)
          content = getReaders(Content.DIR_NAME);
      }
      return (Content)getEntry(content, url, new Content());
    }

    /** Open the output generated by this format. */
    private MapFile.Reader[] getReaders(String subDir) throws IOException {
      Path dir = new Path(segmentDir, subDir);
      FileSystem fs = dir.getFileSystem(conf);
      Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, SegmentPathFilter.INSTANCE));

      // sort names, so that hash partitioning works
      Arrays.sort(names);
     
      MapFile.Reader[] parts = new MapFile.Reader[names.length];
      for (int i = 0; i < names.length; i++) {
        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
      }
      return parts;
    }
   
    private Writable getEntry(MapFile.Reader[] readers, Text url,
                              Writable entry) throws IOException {
      return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
    }

    public void close() throws IOException {
      if (content != null) { closeReaders(content); }
      if (parseText != null) { closeReaders(parseText); }
      if (parseData != null) { closeReaders(parseData); }
      if (crawl != null) { closeReaders(crawl); }
    }

    private void closeReaders(MapFile.Reader[] readers) throws IOException {
      for (int i = 0; i < readers.length; i++) {
        readers[i].close();
      }
    }

  }
 
  public SegmentHandler(Configuration conf, Path name) throws Exception {
    seg = new Segment(FileSystem.get(conf), name, conf);
  }

  @Override
  public void handle(Request req, HttpServletResponse res, String target,
          int dispatch) throws IOException, ServletException {
    try {
      String uri = req.getUri().toString();
      LOG.info("URI: " + uri);
      addMyHeader(res, "URI", uri);
      Text url = new Text(uri.toString());
      CrawlDatum cd = seg.getCrawlDatum(url);
      if (cd != null) {
        addMyHeader(res, "Res", "found");
        LOG.info("-got " + cd.toString());
        ProtocolStatus ps = (ProtocolStatus)cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
        if (ps != null) {
          Integer TrCode = protoCodes.get(ps.getCode());
          if (TrCode != null) {
            res.setStatus(TrCode.intValue());           
          } else {
            res.setStatus(HttpServletResponse.SC_OK);
          }
          addMyHeader(res, "ProtocolStatus", ps.toString());
        } else {
          res.setStatus(HttpServletResponse.SC_OK);         
        }
        Content c = seg.getContent(url);
        if (c == null) { // missing content
          req.setHandled(true);
          res.addHeader("X-Handled-By", getClass().getSimpleName());
          return;
        }
        byte[] data = c.getContent();
        LOG.debug("-data len=" + data.length);
        Metadata meta = c.getMetadata();
        String[] names = meta.names();
        LOG.debug("- " + names.length + " meta");
        for (int i = 0; i < names.length; i++) {
          boolean my = true;
          char ch = names[i].charAt(0);
          if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
            // pretty good chance it's a standard header
            my = false;
          }
          String[] values = meta.getValues(names[i]);
          for (int k = 0; k < values.length; k++) {
            if (my) {
              addMyHeader(res, names[i], values[k]);
            } else {
              res.addHeader(names[i], values[k]);
            }
          }
        }
        req.setHandled(true);
        res.addHeader("X-Handled-By", getClass().getSimpleName());
        res.setContentType(meta.get(Metadata.CONTENT_TYPE));
        res.setContentLength(data.length);
        OutputStream os = res.getOutputStream();
        os.write(data, 0, data.length);
        res.flushBuffer();
      } else {
        addMyHeader(res, "Res", "not found");
        LOG.info(" -not found " + url);
      }
    } catch (Exception e) {
      e.printStackTrace();
      LOG.warn(StringUtils.stringifyException(e));
      addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
    }
  }

}
TOP

Related Classes of org.apache.nutch.tools.proxy.SegmentHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
ore(a,m) })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); ga('create', 'UA-20639858-1', 'auto'); ga('send', 'pageview'); s reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.