Package edu.uci.ics.crawler4j.url

Examples of edu.uci.ics.crawler4j.url.WebURL


          movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl);
          int newdocid = DocIDServer.getDocID(movedToUrl);
          if (newdocid > 0) {
            return PageFetchStatus.RedirectedPageIsSeen;
          } else {
            WebURL webURL = new WebURL();
            webURL.setURL(movedToUrl);
            webURL.setParentDocid(curURL.getParentDocid());
            webURL.setDepth((short) (curURL.getDepth()));
            webURL.setDocid(-1);
            if (shouldVisit(webURL) && RobotstxtServer.allows(webURL)) {
              webURL.setDocid(DocIDServer.getNewDocID(movedToUrl))
              Frontier.schedule(webURL);
            }
          }
        }
        return PageFetchStatus.Moved;
      } else if (statusCode == PageFetchStatus.PageTooBig) {
        logger.error("Page was bigger than max allowed size: " + curURL.getURL());
      }
      return statusCode;
    }

    try {
      if (!page.isBinary()) {
        htmlParser.parse(page.getHTML(), curURL.getURL());
        page.setText(htmlParser.getText());
        page.setTitle(htmlParser.getTitle());

        if (page.getText() == null) {
          return PageFetchStatus.NotInTextFormat;
        }

        Iterator<String> it = htmlParser.getLinks().iterator();
        List<WebURL> toSchedule = new ArrayList<WebURL>();
        List<WebURL> toList = new ArrayList<WebURL>();
        while (it.hasNext()) {
          String url = it.next();
          if (url != null) {
            int newdocid = DocIDServer.getDocID(url);
            if (newdocid > 0) {
              if (newdocid != docid) {
                WebURL webURL = new WebURL();
                webURL.setURL(url);
                webURL.setDocid(newdocid);
                toList.add(webURL);
              }
            } else {
              WebURL webURL = new WebURL();
              webURL.setURL(url);
              webURL.setDocid(-1);
              webURL.setParentDocid(docid);
              webURL.setDepth((short) (curURL.getDepth() + 1));             
              if (shouldVisit(webURL) && RobotstxtServer.allows(webURL)) {
                if (MAX_CRAWL_DEPTH == -1 || curURL.getDepth() < MAX_CRAWL_DEPTH) {
                  webURL.setDocid(DocIDServer.getNewDocID(url));
                  toSchedule.add(webURL);
                  toList.add(webURL);
                }
              }
            }
View Full Code Here


          int newdocid = DocIDServer.getDocID(uri);
          if (newdocid != -1) {
            if (newdocid > 0) {
              return PageFetchStatus.RedirectedPageIsSeen;
            }
            WebURL webURL = new WebURL();
            webURL.setURL(uri);
            webURL.setDocid(DocIDServer.getNewDocID(uri));
            page.setWebURL(webURL);
          }
        }
      }
View Full Code Here

  // and extract its title and text
 
  private HTMLParser htmlParser = new HTMLParser();

  public Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    Page page = new Page(curURL);
    int statusCode = PageFetcher.fetch(page, true);
    if (statusCode == PageFetchStatus.OK) {
      try {
        htmlParser.parse(page.getHTML(), curURL.getURL());
        page.setText(htmlParser.getText());
        page.setTitle(htmlParser.getTitle());
        return page;
      } catch (Exception e) {
        e.printStackTrace();
View Full Code Here

  /**
   * Sets the static {@code RobotstxtServer.active} flag.
   *
   * <p>NOTE(review): presumably this flag switches robots.txt checking on or off for the
   * crawler; the code that reads the flag is not visible here, so confirm.
   *
   * @param active the new value stored in {@code RobotstxtServer.active}
   */
  public static void setActive(boolean active) {
    RobotstxtServer.active = active;
  }
 
  private static HostDirectives fetchDirectives(String host) {
    WebURL robotsTxt = new WebURL();
    robotsTxt.setURL("http://" + host + "/robots.txt");
    Page page = new Page(robotsTxt);
    int statusCode = PageFetcher.fetch(page, true);
    HostDirectives directives = null;
    if (statusCode == PageFetchStatus.OK) {
      directives = RobotstxtParser.parse(page.getHTML(), USER_AGENT_NAME);     
View Full Code Here

    if (docid > 0) {
      // This URL is already seen.
      return;
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    docid = DocIDServer.getNewDocID(canonicalUrl);
    webUrl.setDocid(docid);
    webUrl.setDepth((short) 0);
    if (!RobotstxtServer.allows(webUrl)) {
      logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
      Frontier.schedule(webUrl);
    }
View Full Code Here

        cursor = urlsDB.openCursor(txn, null);
        result = cursor.getFirst(key, value, null);

        while (matches < max && result == OperationStatus.SUCCESS) {
          if (value.getData().length > 0) {
            WebURL curi = (WebURL) webURLBinding.entryToObject(value);
            results.add(curi);
            matches++;
          }
          result = cursor.getNext(key, value, null);
        }
View Full Code Here

  public static void scheduleAll(List<WebURL> urls) {
    synchronized (mutex) {
      Iterator<WebURL> it = urls.iterator();
      while (it.hasNext()) {
        WebURL url = it.next();
        if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {         
          try {
            workQueues.put(url);
            scheduledPages++;
          } catch (DatabaseException e) {
View Full Code Here

public final class WebURLTupleBinding extends TupleBinding<WebURL> {

  @Override
  public WebURL entryToObject(TupleInput input) {
    WebURL webURL = new WebURL();
    webURL.setURL(input.readString());
    webURL.setDocid(input.readInt());
    webURL.setParentDocid(input.readInt());
    webURL.setDepth(input.readShort());
    return webURL;
  }
View Full Code Here

    parser = new Parser(config);
    pageFetcher = new PageFetcher(config);
  }

  private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        try {
          Page page = new Page(curURL);
          fetchResult.fetchContent(page);
          if (parser.parse(page, curURL.getURL())) {
            return page;
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
View Full Code Here

*/
public class WebURLTupleBinding extends TupleBinding<WebURL> {

  @Override
  public WebURL entryToObject(TupleInput input) {
    WebURL webURL = new WebURL();
    webURL.setURL(input.readString());
    webURL.setDocid(input.readInt());
    webURL.setParentDocid(input.readInt());
    webURL.setParentUrl(input.readString());
    webURL.setDepth(input.readShort());
    webURL.setPriority(input.readByte());
    return webURL;
  }
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.url.WebURL

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.