Examples of URLNormalizers


Examples of org.apache.nutch.net.URLNormalizers

      this.conf = conf;
      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
      limitPages = conf.getBoolean("link.ignore.limit.page", true);
      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
      urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    super.setConf(conf);
    if (conf == null) return;
    if (conf.getBoolean("segment.merger.filter", false))
      filters = new URLFilters(conf);
    if (conf.getBoolean("segment.merger.normalizer", false))
      normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    sliceSize = conf.getLong("segment.merger.slice", -1);
    if ((sliceSize > 0) && (LOG.isInfoEnabled())) {
      LOG.info("Slice size: " + sliceSize + " URLs.");
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
      urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    // normalizers
    this.jobConf = job;
    this.urlFilters = new URLFilters(jobConf);
    this.scfilters = new ScoringFilters(jobConf);
    this.parseUtil = new ParseUtil(jobConf);
    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
      filter = job.getBoolean(GENERATOR_FILTER, true);
      genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
 
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

      normalize = conf.getBoolean(URL_NORMALIZING, false);
      filter = conf.getBoolean(URL_FILTERING, false);

      if (normalize) {
        urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
      }

      if (filter) {
        filters = new URLFilters(conf);
      }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      if (normalise) normalizers = new URLNormalizers(job,
          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
      filter = job.getBoolean(GENERATOR_FILTER, true);
      genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
 
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

      this.setName("FetcherThread" + num);        // use an informative name
      this.context = context;
      Configuration conf = context.getConfiguration();
      this.urlFilters = new URLFilters(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
      // backward-compatible default setting
      this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
      this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
    }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    this.conf = conf;
    parserFactory = new ParserFactory(conf);
    maxParseTime=conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
    sig = SignatureFactory.getSignature(conf);
    filters = new URLFilters(conf);
    normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
    maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
    executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
      .setNameFormat("parse-%d").setDaemon(true).build());
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    private long curTime;

    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      urlNormalizers = new URLNormalizers(context.getConfiguration(),
          URLNormalizers.SCOPE_INJECT);
      interval = context.getConfiguration().getInt("db.fetch.interval.default",
          2592000);
      filters = new URLFilters(context.getConfiguration());
      scfilters = new ScoringFilters(context.getConfiguration());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.