Package org.eweb4j.spiderman.fetcher

Examples of org.eweb4j.spiderman.fetcher.Page


   
    nextModel.getField().addAll(isAlsoParseInNextPageFields);
    tgt.setModel(nextModel);
   
    ModelParser parser = new ModelParser(nextTask, tgt, listener);
    Page nextPageResult = fr.getPage();
    List<Map<String, Object>> nextMaps = parser.parse(nextPageResult);
    if (nextMaps == null)
      return ;
   
    for (Map<String, Object> nextMap : nextMaps){
View Full Code Here


    if (fr == null || fr.getPage() == null)
      return ;
   
    //记录已经访问过该url,下次不要重复访问它
    visitedUrls.add(nextUrl);
    Page nextPageResult = fr.getPage();
    if (nextPageResult.getContent() == null || nextPageResult.getContent().trim().length() == 0)
      return;
   
    //暂时使用默认的发现新URL的逻辑
    Collection<String> _urls = Util.findAllLinkHref(nextPageResult.getContent(), task.url);
//    System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!-------- newUrls-------->" + _urls + ", from->"+nextUrl);
    urls.addAll(_urls);
   
    //递归
    parseNextPage(rule, nextTask, nextPageResult, urls, visitedUrls, finalFields);
View Full Code Here

    return fetchResult;
  }

  private void assemPage(FetchResult fetchResult, HttpEntity entity)
      throws Exception {
    Page page = load(entity);
    page.setUrl(fetchResult.getFetchedUrl());
    fetchResult.setPage(page);
  }
View Full Code Here

   * @param entity
   * @return
   * @throws Exception
   */
  private Page load(HttpEntity entity) throws Exception {
    Page page = new Page();
   
    //设置返回内容的ContentType
    String contentType = null;
    Header type = entity.getContentType();
    if (type != null)
      contentType = type.getValue();
    page.setContentType(contentType);
   
    //设置返回内容的字符编码
    String contentEncoding = null;
    Header encoding = entity.getContentEncoding();
    if (encoding != null)
      contentEncoding = encoding.getValue();
    page.setEncoding(contentEncoding);
   
    //设置返回内容的字符集
    String contentCharset = EntityUtils.getContentCharSet(entity);
    page.setCharset(contentCharset);
   
    //根据配置文件设置的字符集参数进行内容二进制话
    String charset = config.getCharset();
    String content = this.read(entity.getContent(), charset);
    page.setContent(content);
    if (charset == null || charset.trim().length() == 0)
      page.setContentData(content.getBytes());
    else
      page.setContentData(content.getBytes(charset));
   
    return page;
  }
View Full Code Here

TOP

Related Classes of org.eweb4j.spiderman.fetcher.Page

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.