Package edu.uci.ics.crawler4j.crawler

Examples of edu.uci.ics.crawler4j.crawler.Page


  private HTMLParser htmlParser = new HTMLParser();

  public Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    Page page = new Page(curURL);
    int statusCode = PageFetcher.fetch(page, true);
    if (statusCode == PageFetchStatus.OK) {
      try {
        htmlParser.parse(page.getHTML(), curURL.getURL());
        page.setText(htmlParser.getText());
        page.setTitle(htmlParser.getTitle());
        return page;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
View Full Code Here


    return null;
  }
 
  public static void main(String[] args) {
    Downloader myDownloader = new Downloader();
    Page page = myDownloader.download("http://ics.uci.edu");
    if (page != null) {
      System.out.println(page.getText());
    }
  }
View Full Code Here

  }
 
  private static HostDirectives fetchDirectives(String host) {
    WebURL robotsTxt = new WebURL();
    robotsTxt.setURL("http://" + host + "/robots.txt");
    Page page = new Page(robotsTxt);
    int statusCode = PageFetcher.fetch(page, true);
    HostDirectives directives = null;
    if (statusCode == PageFetchStatus.OK) {
      directives = RobotstxtParser.parse(page.getHTML(), USER_AGENT_NAME);     
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we fetched it
      directives = new HostDirectives();
    }
View Full Code Here

    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        try {
          Page page = new Page(curURL);
          fetchResult.fetchContent(page);
          if (parser.parse(page, curURL.getURL())) {
            return page;
          }
        } catch (Exception e) {
View Full Code Here

    return null;
  }

  public void processUrl(String url) {
    System.out.println("Processing: " + url);
    Page page = download(url);
    if (page != null) {
      ParseData parseData = page.getParseData();
      if (parseData != null) {
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;
          System.out.println("Title: " + htmlParseData.getTitle());
          System.out.println("Text length: " + htmlParseData.getText().length());
View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }
View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }
View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            logger.error(e);
          }
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.crawler.Page

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.