Package net.matuschek.html

Examples of net.matuschek.html.HtmlDocument


      // Excerpt from the middle of a crawler method: for an HTML document that
      // still has crawl depth left, parse it, queue one RobotTask per distinct
      // link, and (when form handlers are configured) one task per filled form.
      // The variables doc, u, depth, duplicate, cached, links, visited,
      // docManager, formFiller and hasFormHandlers are declared outside this
      // excerpt.
      // extract links
      try {
        if (doc.isHTML() && (depth > 0)) {
          // solving encoding problem: build the HtmlDocument with the charset
          // announced in the Content-type header when there is one
          // HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
          HtmlDocument htmlDoc = null;
          HttpHeader contentTypeHeader = doc.getHeader("Content-type");
          if (contentTypeHeader != null) {
            String contentType = contentTypeHeader.getValue();
            // case-insensitive search for the charset parameter
            int index = contentType.toLowerCase().indexOf("charset=");
            // NOTE(review): "> 0" ignores a header value that starts with
            // "charset="; everything after "charset=" (8 characters) is passed
            // on unparsed, so quotes or trailing parameters would be included.
            if (index > 0) {
              htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
            } else {
              htmlDoc = new HtmlDocument(u, doc.getContent());
            }
          } else {
            htmlDoc = new HtmlDocument(u, doc.getContent());
          }
 
          // add links
         
          // this depth-check is critical!
          // otherwise far too many RobotTasks will be created
          // this will cause a premature OutOfMemoryException!
          // NOTE(review): depth > 0 is already guaranteed by the enclosing if,
          // so this inner check is redundant as long as that guard stays.
          if (depth > 0) {
            if (duplicate != null) {
              // duplicate is used as a URL string: copy the links of the
              // cached document stored under that URL
              // NOTE(review): linksDoc is dereferenced without a null check --
              // confirm retrieveFromCache cannot return null here.
              HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
              doc.setLinks(linksDoc.getLinks());
            } else if (cached) {
              // intentionally empty in this excerpt: nothing is done for cached documents
            }
            // links not provided yet: take them from the parsed HTML
            if (links == null) {
              links = htmlDoc.getLinks();
              doc.setLinks(links);
            }
            // tasks are only created when the document is not a duplicate
            if (duplicate == null) {
              // links already handled in this document
              HashSet checkedLinks = new HashSet();
              for (int i = 0; i < links.size(); i++) {
                URL link = (URL) links.elementAt(i);
                log.info("Link: "+link);
                // check already here for duplicate links to avoid expensive
                // creation of RobotTasks
                if (!checkedLinks.contains(link)) {
                  checkedLinks.add(link);
                  String myReferer = u.toString();
                  if (u.getUserInfo() != null) {
                    // remove userinfo from referer: keep only the text after
                    // the first "@" and prepend a fixed scheme
                    // NOTE(review): the scheme is hard-coded to "http://", so
                    // any other scheme of u is replaced in the referer.
                    int endindex = myReferer.indexOf("@")+1;
                    myReferer = "http://"+ myReferer.substring(endindex);
                  }
                 
                  // links.elementAt(i) is the same element as 'link' above;
                  // the new task gets one level less of remaining depth
                  RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
                  // check already here for visited tasks to save memory
                  if (!visited.contains(newTask)) {
                    // bad workaround to retrieve images first
                    // (the suffix test is case-sensitive and matches ".jpg" only)
                    if (newTask.urlString.endsWith(".jpg")) {
                      addTaskAtStart(newTask);
                    } else {
                      addTask(newTask);
                    }
                  }
                }
              }
            }
          }
         
          if (hasFormHandlers) {
            // add forms: let the form filler turn each <form> element into a
            // URL plus parameters; a null result means no task for that form
            Vector forms = htmlDoc.getElements("form");
            for (int i = 0; i < forms.size(); i++) {
              ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
              if (eurl != null) {
                // the current document URL (including any userinfo) is the referer
                RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
                newTask.setParamString(eurl.getParams());
View Full Code Here


    // Demo body: fetch one page over HTTP, parse it as HTML and print every
    // link found in it. The enclosing method header is outside this excerpt.
    // presumably log4j's default console setup -- confirm against the imports
    BasicConfigurator.configure();
   
    HttpTool tool = new HttpTool();
    // NOTE(review): the URL literal embeds a username and password in its
    // userinfo part; real credentials should not live in source code.
    // The document is requested with HttpConstants.GET and a null third argument.
    HttpDoc doc = tool.retrieveDocument(new URL("http://usul27:a1rrakis@www.atkpremium.com/members/styles/standard/pages/index.php?thispage=modelupdate&thisupdate=083735&thismodel=len004"),
          HttpConstants.GET,null);
    // NOTE(review): the document is parsed with http://localhost as its URL,
    // not the URL it was fetched from -- presumably relative links are then
    // resolved against localhost; confirm this is intended.
    HtmlDocument html=new HtmlDocument(new URL("http://localhost"), doc.getContent());
    // print each extracted link on its own line
    for (URL u: html.getLinks()) {
      System.out.println(u);
    }
   
    //    System.out.println(doc);
  }
View Full Code Here

TOP

Related Classes of net.matuschek.html.HtmlDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.