Package net.sf.regain.crawler.config

Examples of net.sf.regain.crawler.config.UrlMatcher


    if ((! alreadyAccepted) && (! alreadyIgnored)) {
      // Check whether the url matches an entry in the whitelist and not an entry in the blacklist
      // We assume that the caller of addJob() detected the correct values for shouldBeParsed
      // and shouldBeIndexed.
      UrlMatcher urlMatch = mUrlChecker.isUrlAccepted(url);
      boolean accepted;
      if( urlMatch.getShouldBeParsed() || urlMatch.getShouldBeIndexed() )
        accepted = true;
      else
        accepted = false;
     
      int mMaxCycleCount = mConfiguration.getMaxCycleCount();
View Full Code Here


    if( rawDocument.hasLinks() ){
      // Iterate over all found links in the document
      for (Iterator iter = rawDocument.getLinks().entrySet().iterator(); iter.hasNext();){
        Map.Entry entry = (Map.Entry)iter.next();
        // The intention of this call is only to determine the link-extraction and indexing property
        UrlMatcher urlMatch = mUrlChecker.isUrlAccepted((String)entry.getKey());
        // Add the job
        addJob((String)entry.getKey(), rawDocument.getUrl(),
          urlMatch.getShouldBeParsed(), urlMatch.getShouldBeIndexed(), (String)entry.getValue());
      }
    }
  }
View Full Code Here

   * @param url The URL to check.
   * @return Whether the URL is accepted by the black list and the white list.
   */
  public UrlMatcher isUrlAccepted(String url) {
   
    UrlMatcher urlMatchResult = new UrlMatcherResult(false, false);
    mLog.debug("isUrlAccepted for url: " + url);
    // check whether this URL matches to a white list prefix
    for (int i = 0; i < mWhiteListEntryArr.length; i++) {
      if (mWhiteListEntryArr[i].shouldBeUpdated()) {
        UrlMatcher matcher = mWhiteListEntryArr[i].getUrlMatcher();
        if (matcher.matches(url)) {
          // get the values for link extraction and indexing
          // from the current matcher hit
          urlMatchResult.setShouldBeParsed(matcher.getShouldBeParsed());
          urlMatchResult.setShouldBeIndexed(matcher.getShouldBeIndexed());
          mLog.debug("Whitelist matches for url: " + url);
          break;
        }
      }
    }
View Full Code Here

    if (url.startsWith("file://")) {
      // This is a file URL -> We have no information whether this file exists
      // since we didn't remember whether it was accepted or not.
     
      // Check whether the url is accepted by the white and black list
      UrlMatcher urlMatch = isUrlAccepted(url);
      if (! urlMatch.getShouldBeIndexed() ) {
        // This file is not accepted -> Remove it from the index
        return false;
      }
     
      // Check whether the file exists
View Full Code Here

TOP

Related Classes of net.sf.regain.crawler.config.UrlMatcher

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle, Inc. Contact: coftware@gmail.com.