Package org.archive.wayback.resourceindex

Source Code of org.archive.wayback.resourceindex.RemoteResourceIndex

/* RemoteResourceIndex
*
* $Id: RemoteResourceIndex.java 2244 2008-04-16 00:40:18Z bradtofel $
*
* Created on 6:06:36 PM Aug 16, 2006.
*
* Copyright (C) 2006 Internet Archive.
*
* This file is part of Wayback.
*
* Wayback is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Wayback is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Wayback; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.resourceindex;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.archive.wayback.ResourceIndex;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.UrlSearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.resourceindex.filters.SelfRedirectFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.ObjectFilterChain;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
* ResourceIndex implementation that relays a query to a remote index
* implementation over HTTP. The XMLQueryUI is assumed to be active on the
* remote server, and the query is sent over as-is, formulated as an OpenSearch
* query. Results are also returned as-is -- this class attempts to be as
* transparent as possible.
*
* @author brad
* @version $Date: 2008-04-16 01:40:18 +0100 (Qua, 16 Abr 2008) $, $Revision: 2244 $
*/
public class RemoteResourceIndex implements ResourceIndex {
  private static final Logger LOGGER = Logger.getLogger(RemoteResourceIndex
      .class.getName());

  private String searchUrlBase;

  private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

  private static final String WB_XML_REQUEST_TAGNAME = "request";


  private static final String WB_XML_RESULT_TAGNAME = "result";
  private static final String WB_XML_ERROR_TAGNAME = "error";
  private static final String WB_XML_ERROR_TITLE = "title";
  private static final String WB_XML_ERROR_MESSAGE = "message";
  private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();

  @SuppressWarnings("unchecked")
  private final ThreadLocal tl = new ThreadLocal() {
        protected synchronized Object initialValue() {
          DocumentBuilder builder = null;
            try {
              if(factory != null) {
          builder = factory.newDocumentBuilder();
          if (!builder.isNamespaceAware()) {
            LOGGER.severe("Builder is not namespace aware.");
          }
              }
      } catch (ParserConfigurationException e) {
        // TODO: OK to just "eat" this error?
        e.printStackTrace();
      }
      return builder;
        }
    };
    private DocumentBuilder getDocumentBuilder() {
        return (DocumentBuilder) tl.get();
    }

    /**
     * @throws ConfigurationException
     */
    public void init() throws ConfigurationException {
    LOGGER.info("initializing RemoteCDXIndex...");

    this.factory.setNamespaceAware(false);
    LOGGER.info("Using base search url " + this.searchUrlBase);   
  }
  /*
   * (non-Javadoc)
   *
   * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
   */
  public SearchResults query(WaybackRequest wbRequest)
    throws ResourceIndexNotAvailableException,
    ResourceNotInArchiveException, BadQueryException,
    AccessControlException {

    return urlToSearchResults(getRequestUrl(wbRequest),
        getSearchResultFilters(wbRequest));
  }

  protected SearchResults urlToSearchResults(String requestUrl,
      ObjectFilter<SearchResult> filter)
      throws ResourceIndexNotAvailableException,
      ResourceNotInArchiveException, BadQueryException,
      AccessControlException {

    Document document = null;
    try {
      // HTTP Request + parse
      LOGGER.info("Getting index XML from ("+requestUrl+")");
      document = getHttpDocument(requestUrl);
    } catch (IOException e) {
      // TODO: better error for user:
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException(e.getMessage());
    } catch (SAXException e) {
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException("Unexpected SAX: "
          + e.getMessage());
    }

    checkDocumentForExceptions(document);
    return documentToSearchResults(document, filter);
  }
 
  protected void checkDocumentForExceptions(Document document)
    throws ResourceIndexNotAvailableException,
    ResourceNotInArchiveException, BadQueryException,
    AccessControlException {

    NodeList errors = document.getElementsByTagName(WB_XML_ERROR_TAGNAME);
    if(errors.getLength() != 0) {
      String errTitle = getNodeContent((Element) errors.item(0),
          WB_XML_ERROR_TITLE);
      String errMessage =  getNodeContent((Element) errors.item(0),
          WB_XML_ERROR_MESSAGE);
     
      // TODO: Localization Problems.. Think of something clever.
      if(errTitle == null) {
        throw new ResourceIndexNotAvailableException("Unknown error!");
      } else if(errTitle.equals("Resource Not In Archive")) {
        throw new ResourceNotInArchiveException(errMessage);
      } else if(errTitle.equals("Bad Query Exception")) {
        throw new BadQueryException(errMessage);
      } else if(errTitle.equals("Resource Index Not Available Exception")) {
        throw new ResourceIndexNotAvailableException(errMessage);
      } else if(errTitle.equals("Access Control Exception")) {
        throw new AccessControlException(errMessage);
      } else {
        throw new ResourceIndexNotAvailableException("Unknown error!");       
      }
    }
  }
  private String getResultsType(Document document) {
    NodeList list = document.getElementsByTagName(
        WaybackConstants.RESULTS_TYPE);
    if(list.getLength() == 1) {
      return list.item(0).getTextContent();
    } else {
      return WaybackConstants.RESULTS_TYPE_CAPTURE;
    }
  }
 
  protected ObjectFilter<SearchResult> getSearchResultFilters(
      WaybackRequest wbRequest) {
    String searchType = wbRequest.get(WaybackConstants.REQUEST_TYPE);
    ObjectFilterChain<SearchResult> filters =
                    new ObjectFilterChain<SearchResult>();
   
    if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY)
        || searchType.equals(WaybackConstants.REQUEST_CLOSEST_QUERY)) {
     
      SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter();
      selfRedirectFilter.setCanonicalizer(canonicalizer);
      filters.addFilter(selfRedirectFilter);
    } else {
      // no filters for now
      filters = null;
    }
    return filters;
  }
 
  protected SearchResults documentToSearchResults(Document document,
      ObjectFilter<SearchResult> filter) {
    SearchResults results = null;
    NodeList filters = getRequestFilters(document);
    String resultsType = getResultsType(document);
    if(resultsType.equals(WaybackConstants.RESULTS_TYPE_CAPTURE)) {
      results = new CaptureSearchResults();
    } else {
      results = new UrlSearchResults();
    }
    for(int i = 0; i < filters.getLength(); i++) {
      String key = filters.item(i).getNodeName();
      String value = filters.item(i).getTextContent();
      if(!key.equals("#text")) {
        results.putFilter(key,value);
      }
    }
   
    NodeList xresults = getSearchResults(document);
    for(int i = 0; i < xresults.getLength(); i++) {
      Node xresult = xresults.item(i);
      SearchResult result = searchElementToSearchResult(xresult);
     
      int ruling = ObjectFilter.FILTER_INCLUDE;
      if (filter != null) {
        ruling = filter.filterObject(result);
      }
     
      if (ruling == ObjectFilter.FILTER_ABORT) {
        break;
      } else if (ruling == ObjectFilter.FILTER_INCLUDE) {
        results.addSearchResult(result, true);
      }
    }
    return results;
  }

  private SearchResult searchElementToSearchResult(Node e) {

    SearchResult result = new SearchResult();

    NodeList chitlens = e.getChildNodes();
    for(int i = 0; i < chitlens.getLength(); i++) {
      String key = chitlens.item(i).getNodeName();
      String value = chitlens.item(i).getTextContent();
      if(!key.equals("#text")) {
        result.put(key,value);
      }
    }
    return result;
  }

  protected NodeList getRequestFilters(Document d) {
    if (d == null) {
      return null;
    }
    // Jump to the search item list.
    NodeList nodes = d.getElementsByTagName(WB_XML_REQUEST_TAGNAME);
    if(nodes.getLength() != 1) {
      // TODO: warning?
      return null;
    }
    return nodes.item(0).getChildNodes();
  }

  protected NodeList getSearchResults(Document d) {
    if (d == null) {
      return null;
    }
    NodeList nodes = d.getElementsByTagName(WB_XML_RESULT_TAGNAME);
    return (nodes.getLength() <= 0) ? null : nodes;
  }

  protected String getRequestUrl(WaybackRequest wbRequest)
      throws BadQueryException {
    WaybackRequest tmp = wbRequest.clone();
    String type = tmp.get(WaybackConstants.REQUEST_TYPE);
    if(type.equals(WaybackConstants.REQUEST_REPLAY_QUERY)) {
      tmp.put(WaybackConstants.REQUEST_TYPE, WaybackConstants.REQUEST_URL_QUERY);
    }
    return this.searchUrlBase + "?" + tmp.getQueryArguments();
  }

  // extract the text content of a single tag under a node
  protected String getNodeContent(Element e, String key) {
    NodeList nodes = e.getElementsByTagName(key);
    String result = null;
    if (nodes != null && nodes.getLength() > 0) {
      result = nodes.item(0).getTextContent();
    }
    return (result == null || result.length() == 0) ? null : result;
  }

  // do an HTTP request, plus parse the result into an XML DOM
  protected Document getHttpDocument(String url)
      throws IOException, SAXException {
    return (getDocumentBuilder()).parse(url);
  }
  protected Document getFileDocument(File f)
      throws IOException, SAXException {
    return (getDocumentBuilder()).parse(f);
  }

  /**
   * @return the searchUrlBase
   */
  public String getSearchUrlBase() {
    return searchUrlBase;
  }

  /**
   * @param searchUrlBase the searchUrlBase to set
   */
  public void setSearchUrlBase(String searchUrlBase) {
    this.searchUrlBase = searchUrlBase;
  }

  public void shutdown() throws IOException {
    // No-op
  }

  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }

  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
  }
}
TOP

Related Classes of org.archive.wayback.resourceindex.RemoteResourceIndex

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.