Examples of edu.uci.ics.crawler4j.crawler.Page

edu.uci.ics.crawler4j.crawler.Page
@author Yasser Ganjisaffar

  private HTMLParser htmlParser = new HTMLParser();


  public Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    Page page = new Page(curURL);
    int statusCode = PageFetcher.fetch(page, true);
    if (statusCode == PageFetchStatus.OK) {
      try {
        htmlParser.parse(page.getHTML(), curURL.getURL());
        page.setText(htmlParser.getText());
        page.setTitle(htmlParser.getTitle());
        return page;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

View Full Code Here

    return null;
  }
  
  public static void main(String[] args) {
    Downloader myDownloader = new Downloader();
    Page page = myDownloader.download("http://ics.uci.edu");
    if (page != null) {
      System.out.println(page.getText());
    }
  }

View Full Code Here

  }
  
  private static HostDirectives fetchDirectives(String host) {
    WebURL robotsTxt = new WebURL();
    robotsTxt.setURL("http://" + host + "/robots.txt");
    Page page = new Page(robotsTxt);
    int statusCode = PageFetcher.fetch(page, true);
    HostDirectives directives = null;
    if (statusCode == PageFetchStatus.OK) {
      directives = RobotstxtParser.parse(page.getHTML(), USER_AGENT_NAME);      
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we fetched it
      directives = new HostDirectives();
    }

View Full Code Here

    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        try {
          Page page = new Page(curURL);
          fetchResult.fetchContent(page);
          if (parser.parse(page, curURL.getURL())) {
            return page;
          }
        } catch (Exception e) {

View Full Code Here

    return null;
  }


  public void processUrl(String url) {
    System.out.println("Processing: " + url);
    Page page = download(url);
    if (page != null) {
      ParseData parseData = page.getParseData();
      if (parseData != null) {
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;
          System.out.println("Title: " + htmlParseData.getTitle());
          System.out.println("Text length: " + htmlParseData.getText().length());

View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }

View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }

View Full Code Here

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            logger.error(e);
          }

View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.crawler.Page

edu.uci.ics.crawler4j.example.advanced.Downloader

edu.uci.ics.crawler4j.examples.localdata.Downloader

edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

org.apache.http.Header

java.nio.charset.Charset

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.