Package com.iqbon.spider.domain

Examples of com.iqbon.spider.domain.Source


    sourceDao.insertOne(source);
  }

  @Test
  public void testQuerySourceByUrl() {
    Source s = sourceDao.querySourceByUrl(source.getUrl());
    logger.info(ToStringBuilder.reflectionToString(s));

  }
View Full Code Here


    }
  }

  @Test
  public void testUpdateSource() {
    Source s = sourceDao.querySourceByUrl(source.getUrl());
    SourceMatcher matcher = new SourceMatcher();
    matcher.setLinkParent("li[class=list-right]");
    matcher.setDescription("�Ҳ�����");
    Replace replace = new Replace();
    replace.setMatcher("<div class=\"gg200x300\">[\\s\\S]*?</div>");
    replace.setReplacement("");
    Replace replace2 = new Replace();
    replace2.setMatcher("<a [\\s\\S]*?>");
    replace2.setReplacement("");
    Replace replace3 = new Replace();
    replace3.setMatcher("</a>");
    replace3.setReplacement("");
    List<Replace> replaces = new ArrayList<Replace>();
    List<SourceMatcher> matchers = new ArrayList<SourceMatcher>();
    matchers.add(matcher);
    replaces.add(replace);
    replaces.add(replace2);
    replaces.add(replace3);
    s.setContentMatcher("#endText");
    s.setMatchers(matchers);
    s.setReplaces(replaces);
    s.setDescription("after test");
    sourceDao.updateSource(s);
  }
View Full Code Here

   * @param url
   * @param sourceUrl
   * @return
   */
  public Record getCrawlContentBySourceAndUrl(String url, String sourceUrl) {
    Source source = sourceDao.querySourceByUrl(sourceUrl);
    return crawlService.getContentFromLink(url, source);
  }
View Full Code Here

      return "输入的URL不合法";
    }
    if (StringUtils.isEmpty(contentMatcher)) {
      return "内容匹配规则不能为空";
    }
    Source duplicate = sourceDao.querySourceByUrl(url);
    if (duplicate != null) {
      return "已经存在重复的采集数据源" + url;
    }
    Source source = new Source();
    source.setUrl(url);
    source.setContentMatcher(contentMatcher);
    source.setDescription(description);
    source.setMatchers(matchers);
    sourceDao.insertOne(source);
    return null;
  }
View Full Code Here

   * Gets the links that the crawl data source identified by the given url can match.
   * @param url URL identifying the crawl data source
   * @return
   */
  public List<String> getCrawlRecordBySourceId(String url) {
    Source source = sourceDao.querySourceByUrl(url);
    return crawlService.getLinkFromSource(source);
  }
View Full Code Here

    fail("Not yet implemented");
  }

  @Test
  public void testGetContentFromLink() {
    Source source = sourceDao.querySourceByUrl(URL);
    Record record = crawlService.getContentFromLink(
        "http://home.163.com/13/0123/18/8LU3QD5700104JVC.html", source);
    logger.info(ToStringBuilder.reflectionToString(record));
  }
View Full Code Here

TOP

Related Classes of com.iqbon.spider.domain.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.