Package fr.eolya.crawler.connectors.web

Source Code of fr.eolya.crawler.connectors.web.SourceWeb

package fr.eolya.crawler.connectors.web;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

import fr.eolya.crawler.connectors.ISource;
import fr.eolya.crawler.connectors.Source;
import fr.eolya.crawler.connectors.web.StartingUrls;


public class SourceWeb extends Source implements ISource {
 
  protected StartingUrls startingUrls = null;

  public SourceWeb (int id, String className, String crawlMode, Map<String,Object> srcData) throws IOException {
    super(id, className, crawlMode, srcData);
  }

  public String getHost() {
    return getSrcDataString("url_host");
  }

  public String getUserAgent() {
    return getSrcDataString("user_agent");
  }

  public boolean isOptimized() {
    StartingUrls startingUrls = getStartingUrls();
    if (startingUrls==null) return false;
    return startingUrls.isOptimized();
  }
 
  public String getUrlConcurrency() {
    if (!"0".equals(getUrlPerMinute()))
      return String.valueOf(Math.min(16, Integer.parseInt(getSrcDataString("crawl_url_concurrency", "1"))));
    return getSrcDataString("crawl_url_concurrency", "1");
  }

  public StartingUrls getStartingUrls() {
    if (startingUrls==null) {
      try {
        startingUrls = new StartingUrls(getSrcDataStringAsXml("url"));
      } catch (Exception e) {
        e.printStackTrace();
        return null;
      }
    }
    return startingUrls;
  }

  public String getUrlIgnoreFields(){
    // fields which ARE session id
    return getSrcDataString("url_ignore_fields");
  }
 
  public String getUrlIgnoreFieldsNoSessionId(){
    // fields which ARE NOT session id
    return getSrcDataString("url_ignore_fields_no_session_id");
  }

  /**
   * Provide the strategy for urls with both http and https protocol
   * @return 0 means we want both urls, 1 means we want only the http url, 2 means we want only the https url
   */
  public int getProtocolStrategy(){
    String value = getSrcDataString("protocol_strategy");
    if (value==null || "".equals(value)) return 0;
    return Integer.parseInt(value);
  }

    public int getCheckDeletedStrategy(){
        String value = getSrcDataString("checkdeleted_strategy");
        if (value==null || "".equals(value)) return 0;
        return Integer.parseInt(value);
    }

    public List<String> getHostAliases() {
      List<String> aliases = null;
      if (StringUtils.trimToNull(getSrcDataString("alias_host"))!=null)
        aliases = new ArrayList<String>(Arrays.asList(getSrcDataString("alias_host").split(",")));
      if (aliases==null) return Arrays.asList(getHost().split(","));
      aliases.add(getHost());
      return aliases;
  }

  public String getAutomaticCleaning(){
    return getSrcDataString("automatic_cleaning");
  }

  public String getDescription() {
    return getHost() + "(" + getId() + ")";
  }
}
TOP

Related Classes of fr.eolya.crawler.connectors.web.SourceWeb

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.