Package org.archive.wayback.resourcestore.indexer

Source Code of org.archive.wayback.resourcestore.indexer.HTTPRecordAnnotater

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.resourcestore.indexer;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.archive.wayback.util.htmllex.ParseContext;
import org.archive.wayback.util.htmllex.ParseEventDelegator;
import org.archive.wayback.util.url.UrlOperations;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

public class HTTPRecordAnnotater {
  private RobotMetaRule rule = null;
  private ParseEventDelegator rules = null;
  private RobotMetaFlags robotFlags;
  private static final Logger LOGGER =
        Logger.getLogger(HTTPRecordAnnotater.class.getName());
  private static final String UPPER_LOCATION =
      WaybackConstants.LOCATION_HTTP_HEADER.toUpperCase();
  private final static String[] mimes = {
    "html"
  };
  public HTTPRecordAnnotater() {
    rules = new ParseEventDelegator();
    rules.init();
    rule = new RobotMetaRule();
    robotFlags = new RobotMetaFlags();
    rule.setRobotFlags(robotFlags);
    rule.visit(rules);
  }
  public boolean isHTML(String mimeType) {
    String mimeLower = mimeType.toLowerCase();
    for(String mime : mimes) {
      if(mimeLower.contains(mime)) {
        return true;
      }
    }
    return false;
  }

  private String escapeSpaces(final String input) {
    if(input.contains(" ")) {
      return input.replace(" ", "%20");
    }
    return input;
  }
 
  public String transformHTTPMime(String input) {
    if(input == null) {
      return null;
    }
    int semiIdx = input.indexOf(";");
    if(semiIdx > 0) {
      return escapeSpaces(input.substring(0,semiIdx).trim());
    }
    return escapeSpaces(input.trim());
  }
 
  public void annotateHTTPContent(CaptureSearchResult result,
        InputStream is, Header[] headers, String mimeGuess) {
    robotFlags.reset();
    String mimeType = null;
    if (headers != null) {
 
      for (Header httpHeader : headers) {
        if (httpHeader.getName().toUpperCase().equals(
            UPPER_LOCATION)) {
         
          // Old Comment: "Location" is supposed to be absolute:
          // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
          // (section 14.30) but Content-Location can be
          // relative.
          // is it correct to resolve a relative Location, as
          // we are?
          // it's also possible to have both in the HTTP
          // headers...
          // should we prefer one over the other?
          // right now, we're ignoring "Content-Location"
          //
         
          // NOTE: FILLING THE REDIRECT FIELD IN CDX IS DISABLED!
          // If we want to support redirect in cdx as long as the url is valid
          // comment out the following lines:
         
          // String locationStr = httpHeader.getValue();
          // result.setRedirectUrl(
          //    UrlOperations.resolveUrl(result.getOriginalUrl(),
          //        locationStr, "-"));

        } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
          mimeType = transformHTTPMime(httpHeader.getValue());
        } else if(httpHeader.getName().toLowerCase().equals(
            WaybackConstants.X_ROBOTS_HTTP_HEADER)) {

          robotFlags.parse(httpHeader.getValue());
        }
      }
    }
   
    // TODO: get the encoding:
    String encoding = "utf-8";
    if(mimeType == null) {
      // nothing present in the HTTP headers.. Use the WARC field:
      mimeType = transformHTTPMime(mimeGuess);
    }
    if(mimeType == null) {
      mimeType = "unknown";
    }
    result.setMimeType(mimeType);
    // Now the sticky part: If it looks like an HTML document, look for
    // robot meta tags:
    if(isHTML(mimeType)) {
      String fileContext = result.getFile() + ":" + result.getOffset();
      annotateHTMLContent(is, encoding, fileContext, result);
    }
    robotFlags.apply(result);
   
  }
 
  public void annotateHTMLContent(InputStream is, String charSet, String fileContext,
      CaptureSearchResult result) {

    ParseContext context = new ParseContext();
    
      Node node;
      try {
          ContextAwareLexer lex = new ContextAwareLexer(
              new Lexer(new Page(is,charSet)),context);
      while((node = lex.nextNode()) != null) {
//        System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):");
//        System.err.println("-------------------/START");
//        System.err.println(node.toHtml(true));
//        System.err.println("-------------------/END");
        rules.handleNode(context, node);
      }
      rules.handleParseComplete(context);
    } catch (ParserException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
    } catch (UnsupportedEncodingException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
    } catch (IOException e) {
      LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
    }
  }
}
TOP

Related Classes of org.archive.wayback.resourcestore.indexer.HTTPRecordAnnotater

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.