Package us.codecraft.webmagic.selector

Examples of us.codecraft.webmagic.selector.Html


    }

    @Test
    public void testDownloader() {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        Html html = httpClientDownloader.download("https://github.com");
        assertTrue(!html.getFirstSourceText().isEmpty());
    }
View Full Code Here


*/
public class HtmlTest {

    @Test
    public void testRegexSelector() {
        Html selectable = new Html("aaaaaaab");
    assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
    }
View Full Code Here

    }

  @Test
  public void testDisableJsoupHtmlEntityEscape() throws Exception {
    Html.DISABLE_HTML_ENTITY_ESCAPE = true;
    Html html = new Html("aaaaaaa&b");
    assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
  }
View Full Code Here

  }

  @Test
  public void testEnableJsoupHtmlEntityEscape() throws Exception {
    Html.DISABLE_HTML_ENTITY_ESCAPE = false;
    Html html = new Html("aaaaaaa&b");
    assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
  }
View Full Code Here

     *
     * @return html
     */
    public Html getHtml() {
        if (html == null) {
            html = new Html(UrlUtils.fixAllRelativeHrefs(rawText, request.getUrl()));
        }
        return html;
    }
View Full Code Here

        }
        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        webDriverPool.returnToPool(webDriver);
        return page;
    }
View Full Code Here

* @author code4crafer@gmail.com
*/
public class AmanzonPageProcessor implements PageProcessor{
    public void process(Page page) {

        Html html = page.getHtml();
        List<String> questionList =  html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();

        if(questionList != null && questionList.size() > 1)
        {
            //i=0 is the column-name row, so i starts from 1
            for( int i = 1 ; i < questionList.size(); i++)
            {
                System.out.println(questionList.get(i));
                Html tempHtml =  Html.create("<table>"+questionList.get(i)+"</table>");
                String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString();
                System.out.println(comment);
                String answerNum =  tempHtml.xpath("//td[@class='num']/text()").toString();
                System.out.println(answerNum);
                String createTime = tempHtml.xpath("//td[3]/text()").toString();
                System.out.println(createTime);

        /* Document doc = Jsoup.parse(questionList.get(i));
         Html hmt  = Html.create(questionList.get(i)) ;
           String str = hmt.links().toString();
View Full Code Here

    public void test() {
        ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class);
        Page page = new Page();
        page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
        page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
        page.setHtml(new Html(html));
        long time = System.currentTimeMillis();
        for (int i = 0; i < 1000; i++) {
            modelPageProcessor.process(page);
        }
        System.out.println(System.currentTimeMillis() - time);
View Full Code Here

            "</html>\n" +
            "\n";
    @Override
    public Page download(Request request, Task task) {
        Page page = new Page();
        page.setHtml(new Html(html));
        page.setRequest(new Request("https://github.com/code4craft/webmagic"));
        page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
        return page;
    }
View Full Code Here

TOP

Related Classes of us.codecraft.webmagic.selector.Html

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.