Package org.vietspider.html

Examples of org.vietspider.html.HTMLNode


    return newDocument;
  }

  public HTMLDocument[] extractRow(HTMLDocument document, NodePath[] nodePaths) {
    List<List<HTMLNode>> listHtmlValues = new ArrayList<List<HTMLNode>>();
    HTMLNode root = document.getRoot();

    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> matchValues = matchNodes(root, nodePaths[i]);
      if(matchValues != null) listHtmlValues.add(matchValues);
    }

    if(listHtmlValues.size() == 0 || listHtmlValues.get(0) == null) return new HTMLDocument[0];

    List<HTMLNode> htmlValues = listHtmlValues.get(0);
    HTMLDocument [] newDocuments = new HTMLDocument[htmlValues.size()];

    for(int i = 0; i < htmlValues.size(); i++) {
      HTMLNode html = HTMLParser2.clone(root);
      if(htmlValues.get(i) == null) continue;
      html.addChild(htmlValues.get(i));
//      htmlValues.get(i).setParent(html);
      for(int j = 1; j < listHtmlValues.size(); j++) {
        List<HTMLNode> newHtmlValues = listHtmlValues.get(j);
        if(i > newHtmlValues.size()) break;
        try {
          if(newHtmlValues.get(i) == null) continue;
          html.addChild(newHtmlValues.get(i));
//          newHtmlValues.get(i).setParent(html);
        } catch (Exception e) {
          continue;
        }
      }
View Full Code Here


    for(; start > -1; start--) {
      NodeImpl node = tokens.get(start);
      if(node.isNode(Name.FORM)) break;
    }

    HTMLNode form = null;
    boolean md5 = false;
    List<HTMLNode> inputs = new ArrayList<HTMLNode>();
    String formValue = null;

    for(int i = start; i < tokens.size(); i++) {
      NodeImpl node = tokens.get(i);
      if(node.isNode(Name.FORM)) {
        if(node.isOpen()) {
          if(!md5) {
            String value = new String(node.getValue());
            md5 = value.toLowerCase().indexOf("md5") > -1;
          }
          form = node;
          formValue = new String(form.getValue()).toLowerCase();
        } else {
          break;
        }
      } else if(node.isNode(Name.INPUT)) {
        if(!md5 && formValue != null) {
View Full Code Here

      }
     
      Node node = (Node)inodes[i];
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
        HTMLNode test = lookNode(htmlNode, node);
        if(test == null) {
          continue;
        }
        htmlValues.add(lookNode(htmlNode, node));
      }
View Full Code Here

//    System.out.println("node expresstion "+nodeExp.toString());
//    System.out.println("attributes length "+nodeExp.getAttributes().length);
    int counter  = 0;
    NodeMatcher matcher = new NodeMatcher();
    for(int i = 0; i < htmlChildren.size(); i++) {
      HTMLNode childNode =  htmlChildren.get(i);
      if(nodeExp.getName() != childNode.getName()) continue;
      if(matcher.match(nodeExp.getPattern(), counter)) {
        Attribute [] attrs = nodeExp.getAttributes();
        if(attrs == null || attrs.length < 1) {
//          System.out.println(" da xay ra roi ");
          htmlValues.add(childNode);
        } else {
//          System.out.println(" xay ra ");
          Attributes nodeAttributes = childNode.getAttributes();
          if(matcher.contains(nodeAttributes, attrs)) htmlValues.add(childNode);
        }
      }
      counter++;
   
View Full Code Here

      if(matchValues != null) nodes.addAll(matchValues);
    }

    for(HTMLNode node : nodes) {
      if(node == null) continue;
      HTMLNode parent  = node.getParent();
      if(parent == null) continue;
//      System.out.println(" truoc " + parent.getChildren().size());
      parent.removeChild(node);
//      System.out.println(" sau " + parent.getChildren().size());
    }
  }
View Full Code Here

//      System.out.println(" sau " + parent.getChildren().size());
    }
  }

  public void removeFrom(HTMLNode root, NodePath path){
    HTMLNode element = lookNode(root, path);
    if (element == null) return;
    java.util.Iterator<HTMLNode> iter =  element.getParent().getChildren().iterator();
    boolean remove = false;
    while(iter.hasNext()){
      HTMLNode ele = iter.next();
      if(!remove) remove = ele == element;
      if(remove) iter.remove();
    }
  }
View Full Code Here

    this.linkChecker = linkChecker;
    StringBuilder builder = new StringBuilder();
   
    NodeIterator iterator = root.iterator(ignores);
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      switch (node.getName()) {
      case CONTENT:
        char [] chars = node.getValue();
        if(!isEmpty(chars)) {
//          if(isValid(contents, node, constain)) {
            int start = builder.length();
            for(int k = 0; k < chars.length; k++) {
              builder.append(chars[k] == '\n' ? ' ' : chars[k]);
            }
            HTMLNode parent = node.getParent();
            if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
           
            int end = builder.length();
            positions.add(new NodePosition(node, start, end));
          }
//        }
View Full Code Here

 
  private boolean isWrapperContent(HTMLNode node){
    List<HTMLNode> children = node.getChildren();
    if(children == null) return false;
    for(int i = 0; i < children.size(); i++) {
      HTMLNode child = children.get(i);
      if(child.isNode(Name.CONTENT)
         || isWrapperContent(child)) return true;
    }
    return false;
  }
View Full Code Here

  public static HTMLNode searchBody(HTMLDocument document) throws Exception {
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      char [] chars = node.getValue();
      chars = decoder.decode(chars);

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
      chars =  java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray();
      node.setValue(chars);             
   

    HTMLExtractor extractor  = new HTMLExtractor();
    NodePathParser pathParser = new NodePathParser();
View Full Code Here

//      if(isValid(contents, node, constain)) {
        int start = builder.length();
        for(int k = 0; k < chars.length; k++) {
          builder.append(chars[k] == '\n' ? ' ' : chars[k]);
        }
        HTMLNode parent = node.getParent();
        if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');

        int end = builder.length();
        positions.add(new NodePosition(node, start, end));
      }
//    }
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.