Package org.apache.nutch.net

Examples of org.apache.nutch.net.URLNormalizers


        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
      filter = job.getBoolean(GENERATOR_FILTER, true);
      genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
 
View Full Code Here


    if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN)
        && !mode.equals(PARTITION_MODE_HOST)) {
      LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
      mode = PARTITION_MODE_HOST;
    }
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
  }
View Full Code Here

      this.conf = conf;
      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
      limitPages = conf.getBoolean("link.ignore.limit.page", true);
      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
      urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    }
View Full Code Here

    // normalizers
    this.jobConf = job;
    this.urlFilters = new URLFilters(jobConf);
    this.scfilters = new ScoringFilters(jobConf);
    this.parseUtil = new ParseUtil(jobConf);
    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  }
View Full Code Here

      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      }
    }
View Full Code Here

    // normalizers
    this.jobConf = job;
    this.urlFilters = new URLFilters(jobConf);
    this.scfilters = new ScoringFilters(jobConf);
    this.parseUtil = new ParseUtil(jobConf);
    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  }
View Full Code Here

    if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN)
        && !mode.equals(PARTITION_MODE_HOST)) {
      LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
      mode = PARTITION_MODE_HOST;
    }
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
  }
View Full Code Here

      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      }
    }
View Full Code Here

    if (filter) {
      filters = new URLFilters(job);
    }
    if (normalize) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
      normalizers = new URLNormalizers(job, scope);
    }
  }
View Full Code Here

    if (urlFiltering) {
      filters = new URLFilters(job);
    }
    if (urlNormalizers) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
      normalizers = new URLNormalizers(job, scope);
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.net.URLNormalizers

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.