Examples of FetchResult


Examples of org.eweb4j.spiderman.fetcher.FetchResult

    // Parse the next page and find the target URLs inside it
    Task nextTask = new Task(nextUrl, task.url, task.site, 0);
   
    // Fetch the next page through the site's configured fetcher
    FetchRequest req = new FetchRequest();
    req.setUrl(nextUrl);
    FetchResult fr = task.site.fetcher.fetch(req);
    // Nothing usable came back (no result or no page): stop processing this URL
    if (fr == null || fr.getPage() == null)
      return ;
   
    // Record that this URL has been visited so it is not visited again next time
    visitedUrls.add(nextUrl);
    Page nextPageResult = fr.getPage();
    // A page with null or blank content has no links to extract
    if (nextPageResult.getContent() == null || nextPageResult.getContent().trim().length() == 0)
      return;
   
    // For now, use the default logic for discovering new URLs
    Collection<String> _urls = Util.findAllLinkHref(nextPageResult.getContent(), task.url);
View Full Code Here

Examples of org.eweb4j.spiderman.fetcher.FetchResult

   * @date 2013-1-7 11:08:54 AM
   * @param req the fetch request; its URL is the one that gets fetched
   * @return the populated FetchResult (never null)
   */
  public FetchResult fetch(FetchRequest req) throws Exception{
    FetchResult fetchResult = new FetchResult();
    HttpGet get = null;
    HttpEntity entity = null;
    String toFetchURL = req.getUrl();
    try {
      get = new HttpGet(toFetchURL);
      //设置请求GZIP压缩,注意,前面必须设置GZIP解压缩处理
      get.addHeader("Accept-Encoding", "gzip");
      for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext();){
        Entry<String, String> entry = it.next();
        get.setHeader(entry.getKey(), entry.getValue());
      }
     
      //同步信号量,在真正对服务端进行访问之前进行访问间隔的控制
      // TODO 针对每个请求有一个delay的参数设置
      synchronized (mutex) {
        //获取当前时间
        long now = (new Date()).getTime();
        //对同一个Host抓取时间间隔进行控制,若在设置的时限内则进行休眠
        if (now - lastFetchTime < config.getPolitenessDelay())
          Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
        //不断更新最后的抓取时间,注意,是针对HOST的,不是针对某个URL的
        lastFetchTime = (new Date()).getTime();
      }
     
      //记录get请求信息
      Header[] headers = get.getAllHeaders();
      for (Header h : headers){
        Map<String, List<String>> hs = req.getHeaders();
        String key = h.getName();
        List<String> val = hs.get(key);
        if (val == null)
          val = new ArrayList<String>();
        val.add(h.getValue());
       
        hs.put(key, val);
      }
      fetchResult.setReq(req);
      //执行get访问,获取服务端返回内容
      HttpResponse response = httpClient.execute(get);
      headers = response.getAllHeaders();
      for (Header h : headers){
        Map<String, List<String>> hs = fetchResult.getHeaders();
        String key = h.getName();
        List<String> val = hs.get(key);
        if (val == null)
          val = new ArrayList<String>();
        val.add(h.getValue());
       
        hs.put(key, val);
      }
      //设置已访问URL
      fetchResult.setFetchedUrl(toFetchURL);
      String uri = get.getURI().toString();
      if (!uri.equals(toFetchURL))
        if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL))
          fetchResult.setFetchedUrl(uri);
     
      entity = response.getEntity();
      //服务端返回的状态码
      int statusCode = response.getStatusLine().getStatusCode();
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode != HttpStatus.SC_NOT_FOUND) {
          Header locationHeader = response.getFirstHeader("Location");
          //如果是301、302跳转,获取跳转URL即可返回
          if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY))
            fetchResult.setMovedToUrl(URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL));
        }
        //只要不是OK的除了设置跳转URL外设置statusCode即可返回
        //判断是否有忽略状态码的设置
        if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0){
          String[] scs = this.site.getSkipStatusCode().split(",");
          for (String code : scs){
            int c = CommonUtil.toInt(code);
            //忽略此状态码,依然解析entity
            if (statusCode == c){
              assemPage(fetchResult, entity);
              break;
            }
          }
        }
        fetchResult.setStatusCode(statusCode);
        return fetchResult;
      }

      //处理服务端返回的实体内容
      if (entity != null) {
        fetchResult.setStatusCode(statusCode);
        assemPage(fetchResult, entity);
        return fetchResult;
      }
    } catch (Throwable e) {
      e.printStackTrace();
      fetchResult.setFetchedUrl(e.toString());
      fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
      return fetchResult;
    } finally {
      try {
        if (entity == null && get != null)
          get.abort();
      } catch (Exception e) {
        throw e;
      }
    }
   
    fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal());
    return fetchResult;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.