Examples of URLNormalizers


Examples of org.apache.nutch.net.URLNormalizers

    private long curTime;

    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      urlNormalizers = new URLNormalizers(context.getConfiguration(),
          URLNormalizers.SCOPE_INJECT);
      interval = context.getConfiguration().getInt("db.fetch.interval.default",
          2592000);
      filters = new URLFilters(context.getConfiguration());
      scfilters = new ScoringFilters(context.getConfiguration());
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    Configuration conf = context.getConfiguration();
    filters = new URLFilters(conf);
    curTime =
      conf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis());
    normalizers =
      new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
    filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);
    normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);
    schedule = FetchScheduleFactory.getFetchSchedule(conf);
    scoringFilters = new ScoringFilters(conf);
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

  public void configure(final JobConf job) {
    this.conf = job;
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY); // TODO MC
    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER); // TODO MC
    this.filters = new URLFilters(job); // TODO MC
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

  public void configure(final JobConf job) {
    this.conf = job;
    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY); // TODO MC
    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER); // TODO MC
    this.filters = new URLFilters(job); // TODO MC
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    {
      // Extract collection prefix from key to use later when adding
      // signature and link crawldatums.

      this.urlNormalizers =
        new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
      this.filters = new URLFilters(job);
      this.scfilters = new ScoringFilters(job);

      final float interval =
        job.getFloat("db.default.fetch.interval", 30f);
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    }
   
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false))
    {
      this.nwUrlNormalizers =
        new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
    
    this.collectionType = job.get(Global.DATABASE_CONNECTION);   
         LOG.debug("Collection type: " + collectionType + ", requested key: " + Global.COLLECTION_TYPE);
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {   
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

  public UrlNormalizer() {   
  }
 
  public String normalize(String url) throws IOException {
    if (urlNormalizers==null) {
      urlNormalizers=new URLNormalizers(getConf(),URLNormalizers.SCOPE_FETCHER);
    }
    return urlNormalizers.normalize(url,URLNormalizers.SCOPE_FETCHER);
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    this.indexRedirects = job.getBoolean("wax.index.redirects", false);

    this.sha1 = job.getBoolean("wax.digest.sha1", false);

    this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_FETCHER);
    this.filters = new URLFilters(job);

    this.parseUtil = new ParseUtil(job);

    this.collectionName = job.get(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY);
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    if (urlFiltering) {
      filters = new URLFilters(job);
    }
    if (urlNormalizers) {
      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
      normalizers = new URLNormalizers(job, scope);
    }     
  }
View Full Code Here

Examples of org.apache.nutch.net.URLNormalizers

    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
      urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.