Package com.zhangwoo.spider.po

Examples of com.zhangwoo.spider.po.Conversation


    try {
      NodeList cons = (NodeList) xpath.evaluate(
          "//TABLE[@width='349' and @height='210']", docHtml,
          XPathConstants.NODESET);
      for (int i = 0; i < cons.getLength(); i++) {
        Conversation result = new Conversation();
        result.setContent(xpath.evaluate(
            "//TABLE[@width='349' and @height='210']",
                cons.item(i)));
        System.out.println(xpath.evaluate(
            "//TABLE[@width='349' and @height='210']",
                cons.item(i)));
View Full Code Here


      if (analyserNode != null) {
        String anSave = xpath.evaluate("@save", analyserNode);
        String anXpath = xpath.evaluate("@xpath", analyserNode);
        String anRegexp = xpath.evaluate("@regexp", analyserNode);

        Conversation convsXpath = getXpaths(analyserNode);
        Conversation convsRegExp = getRegExps(analyserNode);
        List<Conversation> convsResults = new ArrayList<Conversation>();

        // reg(可能json) 与 xpath(html标签) 走完全不同的路线
        if (!StringUtil.isEmpty(anXpath)
            && StringUtil.isEmpty(anRegexp)) { // 纯粹XPATH,给出结果一定是NodeList
          NodeList cons = (NodeList) xpath.evaluate(anXpath, docHtml,
              XPathConstants.NODESET);
          Conversation conTemp = new Conversation();
          for (int consi = 0; consi < cons.getLength(); consi++) {
            conTemp = analyserConversation(convsXpath, convsRegExp,
                cons.item(consi), null);
            conTemp.setSaveable(anSave);
            if(StringUtil.isEmpty(conTemp.getSelfLink())){
              conTemp.setSelfLink(urlReq.getUrl());
            }
            conTemp.setTid(urlReq.getTask().getTid());
            convsResults.add(conTemp);
          }
        } else if (!StringUtil.isEmpty(anRegexp)) {
          // List<String[]> cons=StringUtil.matchAll(anRegexp, html);
        }
View Full Code Here

  }
 

  private Conversation getRegExps(Node analyserNode)
      throws XPathExpressionException {
    Conversation convsRegExp = new Conversation();
    convsRegExp.setAuthor(xpath.evaluate("AUTHOR/@regexp", analyserNode));
    convsRegExp.setContent(xpath.evaluate("CONTENT/@regexp", analyserNode));
    convsRegExp.setMainLink(xpath
        .evaluate("MAINLINK/@regexp", analyserNode));
    convsRegExp.setPublishTime(xpath.evaluate("PUBLISHTIME/@regexp",
        analyserNode));
    convsRegExp.setSelfLink(xpath
        .evaluate("SELFLINK/@regexp", analyserNode));
    convsRegExp.setTitle(xpath.evaluate("TITLE/@regexp", analyserNode));
    convsRegExp.setUpdateTime(xpath.evaluate("UPDATETIME/@regexp",
        analyserNode));
    convsRegExp.setIsTopic(xpath.evaluate("ISTOPIC/@regexp", analyserNode));
    convsRegExp.setStopByExp(xpath.evaluate("SELFLINK/@stopByExp",
        analyserNode));
    return convsRegExp;
  }
View Full Code Here

    return convsRegExp;
  }

  private Conversation getXpaths(Node analyserNode)
      throws XPathExpressionException {
    Conversation convsXpath = new Conversation();
    convsXpath.setAuthor(xpath.evaluate("AUTHOR/@xpath", analyserNode).trim());
    convsXpath.setContent(xpath.evaluate("CONTENT/@xpath", analyserNode).trim());
    convsXpath.setMainLink(xpath.evaluate("MAINLINK/@xpath", analyserNode).trim());
    convsXpath.setPublishTime(xpath.evaluate("PUBLISHTIME/@xpath",
        analyserNode).trim());
    convsXpath.setSelfLink(xpath.evaluate("SELFLINK/@xpath", analyserNode).trim());
    convsXpath.setTitle(xpath.evaluate("TITLE/@xpath", analyserNode).trim());
    convsXpath.setUpdateTime(xpath.evaluate("UPDATETIME/@xpath",
        analyserNode).trim());
    convsXpath.setIsTopic(xpath.evaluate("ISTOPIC/@xpath", analyserNode).trim());
    convsXpath.setStopByXpath(xpath.evaluate("SELFLINK/@stopByXpath",
        analyserNode).trim());
    convsXpath
        .setRunable(xpath.evaluate("SELFLINK/@runable", analyserNode).trim());
    return convsXpath;
  }
View Full Code Here

   * @throws XPathExpressionException
   */
  private Conversation analyserConversation(Conversation convsXpath,
      Conversation convsRegExp, Node node, String str)
      throws XPathExpressionException {
    Conversation resutl = new Conversation();

    if (node == null) {

    } else if (StringUtil.isEmpty(str)) {
      if (!StringUtil.isEmpty(convsXpath.getAuthor()))
        resutl.setAuthor(xpath.evaluate(convsXpath.getAuthor(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getContent()))
        resutl.setContent(xpath.evaluate(convsXpath.getContent(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getIsTopic()))
        resutl.setIsTopic(xpath.evaluate(convsXpath.getIsTopic(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getMainLink()))
        resutl.setMainLink(xpath.evaluate(convsXpath.getMainLink(),
            node).trim());
      if (!StringUtil.isEmpty(convsXpath.getPublishTime()))
        resutl.setPublishTime(xpath.evaluate(
            convsXpath.getPublishTime(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getRunable()))
        resutl.setRunable(xpath.evaluate(convsXpath.getRunable(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getSelfLink()))
        resutl.setSelfLink(SpiderThread.formatUrl(xpath.evaluate(convsXpath.getSelfLink(),
            node).trim(),this.urlReq));
      if (!StringUtil.isEmpty(convsXpath.getTitle()))
        resutl.setTitle(xpath.evaluate(convsXpath.getTitle(), node).trim());
      if (!StringUtil.isEmpty(convsXpath.getUpdateTime()))
        resutl.setUpdateTime(xpath.evaluate(convsXpath.getUpdateTime(),
            node).trim());
      if (!StringUtil.isEmpty(convsXpath.getRunable()))
        resutl.setRunable(xpath.evaluate(convsXpath.getRunable(), node).trim());
    }

    return resutl;
  }
View Full Code Here

      Document docHtml) {
    List<Conversation> convsResults = new ArrayList<Conversation>();
   
    String[] tmps=StringUtil.match(html, "<a class=\"downlink\" href=\"([^\"]+)\"></a>");
    if(tmps!=null){
      Conversation c=new Conversation();
      c.setSelfLink(StringUtil.match(html, "<a class=\"downlink\" href=\"([^\"]+)\"></a>")[1]);
      tmps=StringUtil.match(html, "<a href=\"/all/\\d+.htm\">(.*?\\-.*?)</a>");
      if(tmps!=null){
        c.setAuthor(tmps[1].split("-")[0]);
        c.setTitle(tmps[1].split("-")[1]);
        convsResults.add(c);
      }
    }
   
   
 
View Full Code Here

      Document docHtml) {
    List<Conversation> convsResults = new ArrayList<Conversation>();
    try {
      NodeList cons = (NodeList) xpath.evaluate("//DIV[@class='text clearfix']", docHtml,XPathConstants.NODESET);
      for (int i = 0; i < cons.getLength(); i++) {
        Conversation result = new Conversation();
        result.setTitle(urlReq.getTitle());
        result.setContent(xpath.evaluate("DIV[@class='title_top']/text()", cons.item(i))+xpath.evaluate("DIV[@class='s_cont']/text()", cons.item(i)));
        result.setSelfLink(xpath.evaluate("DIV[@class='title_top']/A/@href", cons.item(i)));
        result.setPublishTime(StringUtil.match(xpath.evaluate("DIV[@class='title clearfix']/SPAN[@class='level']/text()", cons.item(i)),"\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}")[0]);
        convsResults.add(result);
      }
    } catch (XPathExpressionException e) {
      logger.error("findConversations error",e);
    }
View Full Code Here

TOP

Related Classes of com.zhangwoo.spider.po.Conversation

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.