Package net.sf.regain.crawler.config

Examples of net.sf.regain.crawler.config.UrlMatcherResult


   * @param url Die zu prüfende URL.
   * @return Ob die URL von der Schwarzen und Weißen Liste akzeptiert wird.
   */
  public UrlMatcher isUrlAccepted(String url) {
   
    UrlMatcher urlMatchResult = new UrlMatcherResult(false, false);
    mLog.debug("isUrlAccepted for url: " + url);
    // check whether this URL matches to a white list prefix
    for (int i = 0; i < mWhiteListEntryArr.length; i++) {
      if (mWhiteListEntryArr[i].shouldBeUpdated()) {
        UrlMatcher matcher = mWhiteListEntryArr[i].getUrlMatcher();
        if (matcher.matches(url)) {
          // get the values for link extraction and indexing
          // from the current matcher hit
          urlMatchResult.setShouldBeParsed(matcher.getShouldBeParsed());
          urlMatchResult.setShouldBeIndexed(matcher.getShouldBeIndexed());
          mLog.debug("Whitelist matches for url: " + url);
          break;
        }
      }
    }

    // check whether this URL matches to a black list prefix
    // check only if there was a whitelist-hit
    if( urlMatchResult.getShouldBeParsed() || urlMatchResult.getShouldBeIndexed() ) {
      for (int i = 0; i < mBlackListArr.length; i++) {
        if (mBlackListArr[i].matches(url)) {
          urlMatchResult.setShouldBeParsed(false);
          urlMatchResult.setShouldBeIndexed(false);
          mLog.debug("Blacklist matches for url: " + url);
        }
      }
    }

View Full Code Here

TOP

Related Classes of net.sf.regain.crawler.config.UrlMatcherResult

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.