Examples of DomainSuffix


Examples of org.apache.nutch.util.domain.DomainSuffix

  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

    try {
      URL url = new URL(urlText.toString());
      DomainSuffix d = URLUtil.getDomainSuffix(url);
     
      doc.add("tld", d.getDomain());
     
    }catch (Exception ex) {
      LOG.warn(ex);
    }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

    int index = 0;
    String candidate = host;
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

    int index = 0;
    String candidate = host;
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

    List<String> tlds = doc.getFieldValues("tld");
    float boost = 1.0f;

    if(tlds != null) {
      for(String tld : tlds) {
        DomainSuffix entry = tldEntries.get(tld);
        if(entry != null)
          boost *= entry.getBoost();
      }
    }
    return initScore * boost;
  }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

      // match for suffix, domain, and host in that order.  more general will
      // override more specific
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }
     
      if (domainSet.contains(suffix) || domainSet.contains(domain)
        || domainSet.contains(host)) {
        return url;
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

    NutchField tlds = doc.getField("tld");
    float boost = 1.0f;

    if(tlds != null) {
      for(Object tld : tlds.getValues()) {
        DomainSuffix entry = tldEntries.get(tld.toString());
        if(entry != null)
          boost *= entry.getBoost();
      }
    }
    return initScore * boost;
  }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

    try {
      URL url = new URL(urlText.toString());
      DomainSuffix d = URLUtil.getDomainSuffix(url);
     
      doc.add("tld", d.getDomain());
     
    }catch (Exception ex) {
      LOG.warn(ex);
    }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

      // match for suffix, domain, and host in that order.  more general will
      // override more specific
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }
     
      if (domainSet.contains(suffix) || domainSet.contains(domain)
        || domainSet.contains(host)) {
        return url;
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

    int index = 0;
    String candidate = host;
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

Examples of org.apache.nutch.util.domain.DomainSuffix

      // match for suffix, domain, and host in that order.  more general will
      // override more specific
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }
     
      if (domainSet.contains(suffix) || domainSet.contains(domain)
        || domainSet.contains(host)) {
        return url;
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.