Package net.sf.regain.crawler.config

Examples of net.sf.regain.crawler.config.UrlMatcher


    if ((! alreadyAccepted) && (! alreadyIgnored)) {
      // Check whether the url matches an entry in the whitelist and not an entry in the blacklist
      // We assume that the caller of addJob() detected the correct values for shouldBeParsed
      // and shouldBeIndexed.
      UrlMatcher urlMatch = mUrlChecker.isUrlAccepted(url);
      boolean accepted;
      if( urlMatch.getShouldBeParsed() || urlMatch.getShouldBeIndexed() )
        accepted = true;
      else
        accepted = false;
     
      int mMaxCycleCount = mConfiguration.getMaxCycleCount();
View Full Code Here


    if( rawDocument.hasLinks() ){
      // Iterate over all found links in the document
      for (Iterator iter = rawDocument.getLinks().entrySet().iterator(); iter.hasNext();){
        Map.Entry entry = (Map.Entry)iter.next();
        // The intention of this call is only to determine the link-extraction and indexing property
        UrlMatcher urlMatch = mUrlChecker.isUrlAccepted((String)entry.getKey());
        // Add the job
        addJob((String)entry.getKey(), rawDocument.getUrl(),
          urlMatch.getShouldBeParsed(), urlMatch.getShouldBeIndexed(), (String)entry.getValue());
      }
    }
  }
View Full Code Here

   * @param url The URL to check.
   * @return Whether the URL is accepted by the black list and the white list.
   */
  public UrlMatcher isUrlAccepted(String url) {
   
    UrlMatcher urlMatchResult = new UrlMatcherResult(false, false);
    mLog.debug("isUrlAccepted for url: " + url);
    // check whether this URL matches to a white list prefix
    for (int i = 0; i < mWhiteListEntryArr.length; i++) {
      if (mWhiteListEntryArr[i].shouldBeUpdated()) {
        UrlMatcher matcher = mWhiteListEntryArr[i].getUrlMatcher();
        if (matcher.matches(url)) {
          // get the values for link extraction and indexing
          // from the current matcher hit
          urlMatchResult.setShouldBeParsed(matcher.getShouldBeParsed());
          urlMatchResult.setShouldBeIndexed(matcher.getShouldBeIndexed());
          mLog.debug("Whitelist matches for url: " + url);
          break;
        }
      }
    }
View Full Code Here

    if (url.startsWith("file://")) {
      // This is a file URL -> We have no information whether this file exists
      // since we didn't remember whether it was accepted or not.
     
      // Check whether the url is accepted by the white and black list
      UrlMatcher urlMatch = isUrlAccepted(url);
      if (! urlMatch.getShouldBeIndexed() ) {
        // This file is not accepted -> Remove it from the index
        return false;
      }
     
      // Check whether the file exists
View Full Code Here

TOP

Related Classes of net.sf.regain.crawler.config.UrlMatcher

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle, Inc. Contact: coftware@gmail.com.