Examples of URLNormalizers


Examples of org.apache.nutch.net.URLNormalizers

    if (filter) {
      filters = new URLFilters(job);
    }
    if (normalize) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
      normalizers = new URLNormalizers(job, scope);
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    if (urlFiltering) {
      filters = new URLFilters(job);
    }
    if (urlNormalizers) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
      normalizers = new URLNormalizers(job, scope);
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

  private int seed;
  private URLNormalizers normalizers;

  public void configure(JobConf job) {
    seed = job.getInt("partition.url.by.host.seed", 0);
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
      urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      }
    }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    super.setConf(conf);
    if (conf == null) return;
    if (conf.getBoolean("segment.merger.filter", false))
      filters = new URLFilters(conf);
    if (conf.getBoolean("segment.merger.normalizer", false))
      normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    sliceSize = conf.getLong("segment.merger.slice", -1);
    if ((sliceSize > 0) && (LOG.isInfoEnabled())) {
      LOG.info("Slice size: " + sliceSize + " URLs.");
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

      curTime = job.getLong(CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
      limit = job.getLong(CRAWL_TOP_N,Long.MAX_VALUE)/job.getNumReduceTasks();
      maxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1);
      byIP = job.getBoolean(GENERATE_MAX_PER_HOST_BY_IP, false);
      filters = new URLFilters(job);
      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      hostPartitioner.configure(job);
      filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
      genDelay = job.getLong(CRAWL_GEN_DELAY, 7L) * 3600L * 24L * 1000L;
      long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

  private int seed;
  private URLNormalizers normalizers;

  public void configure(JobConf job) {
    seed = job.getInt("partition.url.by.host.seed", 0);
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    normalize = job.getBoolean(URL_NORMALIZING, false);
    filter = job.getBoolean(URL_FILTERING, false);

    if (normalize) {
      urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_INDEXER);
    }

    if (filter) {
      urlFilters = new URLFilters(getConf());
    }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

      scfilters = new ScoringFilters(job);
      if (job.getBoolean(FILTER_KEY, false)) {
        filters = new URLFilters(job);
      }
      if (job.getBoolean(NORMALIZE_KEY, false)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      }
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.