Package net.matuschek.spider

Source Code of net.matuschek.spider.RegExpURLCheck

package net.matuschek.spider;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/


import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.regexp.RESyntaxException;


/**
* This URLChecker checks a URL using a list of regular expressions
* that should be allowed or denied.
*
* @author Daniel Matuschek
* @version $Revision: 1.4 $
*/
public class RegExpURLCheck
implements URLCheck
{
  /** vector to store the rules */
  private Vector<RegExpRule> rules = null;

  /** default check result if no matching regexp was found */
  private boolean defaultResult = true;

  /** initializes the object with an empty rule set */
  public RegExpURLCheck() {
    rules = new Vector<RegExpRule>();
  }

  /**
   * <p>initialized the object with a rule set from an
   * input stream (e.g. a file)</p>
   *
   * <p>every line of this stream has the format
   * <code>allow|deny expression</code></p>
   *
   * <p>default value can be set with
   * <code>allow|deny .</code> at the end of the file</p>
  
   * <p>lines that start with "#" and empty lines will be
   * ignored</p>
   */
  public RegExpURLCheck(Reader r)
  throws IOException,
  org.apache.regexp.RESyntaxException
  {
    this();

    BufferedReader reader =
      new BufferedReader(r);

    String line = "";
    int lineno=0;

    while (line != null) {
      line=reader.readLine();
      lineno++;

      if ((line != null) &&
          (! line.trim().equals("")) &&
          (! line.startsWith("#"))) {
        StringTokenizer st = new StringTokenizer(line);
        // did we get 2 tokens ?
        if (st.countTokens() != 2) {
          throw new IOException("line "+lineno+" don't consists of 2 fields");
        }

        String allowStr = st.nextToken();
        boolean allow = true;
        String expression = st.nextToken();

        // allow or deny ?
        if (allowStr.equalsIgnoreCase("allow")) {
          allow=true;
        } else if (allowStr.equalsIgnoreCase("deny")) {
          allow=false;
        } else {
          throw new IOException("first token in line "+lineno+
          " has to be allow or deny");
        }

        addRule(expression,allow);
      }
    }
  }


  /**
   * Sets the default result that will be returned if no matching
   * regular expression was found
   * @param default the default result
   */
  public void setDefaultResult(boolean defaultResult) {
    this.defaultResult = defaultResult;
  }

  /**
   * Gets the default result that will be returned if no matching
   * regular expression was found
   * @return the default result
   */
  public boolean getDefaultResult() {
    return defaultResult;
  }

  /**
   * Gets the list of rules
   * @return a vector of RegExpRule objects
   */
  public Vector getRules() {
    return rules;
  }

  /**
   * Sets the list of rules
   * @param rules a vector of RegExpRule objects
   */
  public void setRules(Vector<RegExpRule> rules) {
    this.rules=rules;
  }


  /**
   * adds a allow or deny rule
   * @param regExp a String containing the regular expression
   * @param allow allow (TRUE) or deny (FALSE)
   */
  public void addRule(String regExp, boolean allow)
  throws RESyntaxException
  {
    RegExpRule rule = new RegExpRule();
    rule.setPattern(regExp);
    rule.setAllow(allow);
    rules.add(rule);
  }


  /**
   * Checks if a given URL is allowed or denied by the rules
   *
   * @return true if a matching "allow" rule was found,
   * false if a matching "deny" rule was found,
   * the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURL(URL u) {
    String urlStr = u.toString();

    for (int i=0; i<rules.size(); i++) {
      RegExpRule rule = rules.elementAt(i);

      if (rule.match(urlStr)) {
        return rule.getAllow();
      }
    }

    return defaultResult;
  }

  /**
   * Checks if a given URL is allowed or denied by the rules for processing
   *
   * @return true if a matching "allow" rule was found,
   * false if a matching "deny" rule was found,
   * the default value if no rule was found
   * @see #setDefaultResult(boolean)
   */
  public boolean checkURLForProcessing(URL u) {
    String urlStr = u.toString();

    for (int i=0; i<rules.size(); i++) {
      RegExpRule rule = rules.elementAt(i);

      if (rule.match(urlStr)) {
        return rule.getProcessAllowed();
      }
    }

    return defaultResult;
  }

} // RegExpURLCheck
TOP

Related Classes of net.matuschek.spider.RegExpURLCheck

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.