Package com.subgraph.vega.impl.scanner.handlers

Source Code of com.subgraph.vega.impl.scanner.handlers.UnknownProcessor

/*******************************************************************************
* Copyright (c) 2011 Subgraph.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Subgraph - initial API and implementation
******************************************************************************/
package com.subgraph.vega.impl.scanner.handlers;

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpUriRequest;

import com.subgraph.vega.api.crawler.ICrawlerResponseProcessor;
import com.subgraph.vega.api.crawler.IWebCrawler;
import com.subgraph.vega.api.http.requests.IHttpResponse;
import com.subgraph.vega.api.http.requests.IPageFingerprint;
import com.subgraph.vega.api.model.web.IWebPath;
import com.subgraph.vega.api.model.web.IWebPath.PathType;
import com.subgraph.vega.api.scanner.IInjectionModuleContext;
import com.subgraph.vega.api.scanner.IPathState;

public class UnknownProcessor implements ICrawlerResponseProcessor {
  private final ParametricCheckHandler parametricChecks = new ParametricCheckHandler();
  private final ICrawlerResponseProcessor fetchFileProcessor = new FileProcessor();
  private final ICrawlerResponseProcessor fetchDirProcessor = new DirectoryProcessor();

  @Override
  public void processResponse(IWebCrawler crawler, HttpUriRequest request,
      IHttpResponse response, Object argument) {
    final IInjectionModuleContext ctx = (IInjectionModuleContext) argument;
    final IPathState ps = ctx.getPathState();
    if(ctx.getCurrentIndex() == 0) {
      processInitialResponse(request, response, ctx, ps);
    } else {
      processProbeResponses(request, response, ctx, ps);
    }
  }

  private void processInitialResponse(HttpUriRequest request, IHttpResponse response, IInjectionModuleContext ctx, IPathState ps) {
    ps.setResponse(response);
    if(response.isFetchFail()) {
      ctx.error(request, response, "during initial resource fetch");
      return;
    }
    ps.getPath().setVisited(true);
    final IPageFingerprint fp = response.getPageFingerprint();
    final IPathState par = ps.get404Parent();
    final int rcode = response.getResponseCode();
    if((par == null && rcode == 404) || (ps.hasParent404Fingerprint(fp))) {
      ps.setPageMissing();
      ps.unlockChildren();
      ctx.debug("Starting parametric checks on unknown path because page is missing.");
      parametricChecks.initialize(ps);
      return;
    }

    if(par != null && !response.getBodyAsString().isEmpty() && rcode == 200 && fp.isSame(par.getUnknownFingerprint())) {
      ctx.debug("Unknown path fetch matches parent unknown fp, processing as a file.");
      ps.getPath().setPathType(PathType.PATH_FILE);
      fetchFileProcessor.processResponse(null, request, response, ctx);
      return;
    }

    if(par != null && rcode >= 300 && rcode < 400 && fp.isSame(par.getUnknownFingerprint()) && fp.isSame(par.getPathFingerprint())) {
      ctx.debug("Unknown path fetch matches both parent probes, processing as file.");
      ps.getPath().setPathType(PathType.PATH_FILE);
      fetchFileProcessor.processResponse(null, request, response, ctx);
      return;
    }

    ctx.debug("Sending probes to resolve unknown path.");
    sendProbeRequests(ps);
  }

  private void sendProbeRequests(IPathState ps) {
    final IInjectionModuleContext ctx = ps.createModuleContext();
    ctx.submitAlteredRequest(this, "/", 1);
    ctx.submitAlteredRequest(this, "/abc123/", 2);
  }

  private void processProbeResponses(HttpUriRequest request, IHttpResponse response, IInjectionModuleContext ctx, IPathState ps) {
    if(response.isFetchFail()) {
      ctx.error(request, response, "Fetch failed processing unknown path probe responses");
      callFetchHandler(ctx, ps);
    }
    ctx.addRequestResponse(request, response);
    ctx.incrementResponseCount();
    if(ctx.allResponsesReceived())
      analyzeResponses(ctx, ps);
  }

  private void analyzeResponses(IInjectionModuleContext ctx, IPathState ps) {

    // http://host.com/foo/bar.php/ vs. http://host.com/foo/bar.php/abc123/ vs http://host.com/foo/bar.php
    if(ctx.isFingerprintMatch(1, 2) && ctx.isFingerprintMatch(2, ps.getPathFingerprint())) {
      ctx.debug("Probes all match for unknown path, processing as file.");
      ps.getPath().setPathType(PathType.PATH_FILE);
      callFetchHandler(ctx, ps);
      return;
    }

    final IHttpResponse res1 = ctx.getSavedResponse(1);
    if(ctx.isFingerprintMatch(1, ps.getPathFingerprint())) {
      ctx.debug("Trailing / probe matches initial path fetch, processing as directory.");
      assumeDirectory(res1, ctx, ps);
      return;
    }

    // pivot code, response code
    final int pcode = ps.getResponse().getResponseCode();
    final int rcode = res1.getResponseCode();
    if(pcode >= 300 && pcode < 400) {
      if(hasLocationHeaderWithRequestUri(ps, ctx.getSavedRequest(1))) {
        ctx.debug("Trailing slash probe matches redirect from initial fetch, processing as directory");
        ps.setSureDirectory();
        assumeDirectory(res1, ctx, ps);
        return;
      }
    }

    if(isProbe404(ps, res1)) {
      ctx.debug("Trailing slash probe looks like a 404, processing as a file");
      ps.getPath().setPathType(PathType.PATH_FILE);
    } else if(pcode  < 300 && rcode >= 300 && ps.getResponse().getBodyAsString().length() > 0) {
      ctx.debug("Trailing slash probe returned code 3xx - 5xx and initial fetch was a 200, processing as file");
      ps.getPath().setPathType(PathType.PATH_FILE);
    } else {
      ctx.debug("No idea what this is, try processing as file");
    }
    callFetchHandler(ctx, ps);
  }

  private boolean isProbe404(IPathState ps, IHttpResponse response) {
    final IPathState p404 = ps.get404Parent();
    final int rcode = response.getResponseCode();
    final IPageFingerprint rfp = response.getPageFingerprint();
    if(p404 != null)
      return p404.has404FingerprintMatching(rfp);
    else
      return rcode == 404;
  }

  private boolean hasLocationHeaderWithRequestUri(IPathState ps, HttpUriRequest req) {
    final String locationValue = getLocationHeader(ps);
    if(locationValue == null)
      return false;
    final String uriString = req.getURI().toString();
    return locationValue.equalsIgnoreCase(uriString);

  }
  private String getLocationHeader(IPathState ps) {
    final HttpResponse rr = ps.getResponse().getRawResponse();
    final Header lh = rr.getFirstHeader("Location");
    if(lh == null)
      return null;
    return lh.getValue();
  }

  private void assumeDirectory(IHttpResponse response, IInjectionModuleContext ctx, IPathState ps) {
    ps.getPath().setPathType(PathType.PATH_DIRECTORY);
    ps.setUnknownFingerprint(ps.getPathFingerprint());
    ps.setResponse(response);
    callFetchHandler(ctx, ps);
  }

  private void callFetchHandler(IInjectionModuleContext ctx, IPathState ps) {
    final IWebPath path = ps.getPath();
    final HttpUriRequest req = ps.createRequest();
    final IHttpResponse res = ps.getResponse();
    if(path.getPathType() == PathType.PATH_DIRECTORY || path.getParentPath() == null)
      fetchDirProcessor.processResponse(null, req, res, ctx);
    else
      fetchFileProcessor.processResponse(null, req, res, ctx);
  }
 
  @Override
  public void processException(HttpUriRequest request, Throwable ex, Object argument) {
    final IInjectionModuleContext ctx = (IInjectionModuleContext) argument;
    ctx.reportRequestException(request, ex);
  }
}
TOP

Related Classes of com.subgraph.vega.impl.scanner.handlers.UnknownProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.