Package org.vietspider.html.renderer.checker

Source Code of org.vietspider.html.renderer.checker.LinkBlockChecker

/***************************************************************************
* Copyright 2001-2009 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.renderer.checker;

import java.util.ArrayList;
import java.util.List;

import org.vietspider.common.text.TextCounter;
import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;
import org.vietspider.html.NodeIterator;
import org.vietspider.html.renderer.content.AnalyticsRenderer;
import org.vietspider.token.attribute.Attribute;
import org.vietspider.token.attribute.Attributes;

/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Feb 10, 2009 
*/
public class LinkBlockChecker {
 
  protected ContentChecker contentChecker;
 
  public LinkBlockChecker(ContentChecker contentChecker) {
    this.contentChecker = contentChecker;
  }
 
  boolean isLink(CheckModel model) {
    HTMLNode node = model.getNode();
    List<HTMLNode> links = new ArrayList<HTMLNode>();
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)) links.add(n);
      else if(n.isNode(Name.DIV)
          || n.isNode(Name.SPAN)) {
        if(hasOnclick(n)) {
          links.add(n);
        }
      }
    }
   
    model.setTotalOfLink(links.size());
   
    if(links.size() > 15) {
      if(!hasParagraph(node, 2)) {
        if(model.getNode().getTextValue().indexOf("XUÂN YÊU") > -1) {
          System.out.println("===================================================" );
          System.out.println(model.getNode().getTextValue());
        }
        return true;
      }
//      return true;
    }
   
    if(links.size() >= 3) {
      if(!hasParagraph(node, 1)) return true
    }
   
   
    int rate = compareNodes(links);
//    if(model.getNode().getTextValue().indexOf("u tượng của phần mền trong control panel") > -1) {
//    System.out.println("===================================================" + rate);
//    System.out.println(model.getNode().getTextValue());
//    }
//    if(rate > 0) {
//      System.out.println("================================================");
//      System.out.println("ti le cai nay " + rate);
//      System.out.println(model.getNode().getTextValue());
//    }
    if(rate >= 75) {
      if(hasBlockLink(node)) {
//        if(model.getNode().getTextValue().indexOf("XUÂN YÊU") > -1) {
//          System.out.println("===================================================" + rate);
//          System.out.println(model.getNode().getTextValue());
//        }
        return false;
      }
      return true;
    }
   
    if(rate < 40) {
      return false;
    }
//    int countWord =
    return false;
  }
 
  private boolean hasOnclick(HTMLNode node) {
    Attributes attributes = node.getAttributes();
    Attribute attribute = attributes.get("onclick");
    if(attribute == null) return false;
    String value  = attribute.getValue();
    if(value == null || (value = value.trim()).isEmpty()) return false;
    return true;
  }
 
  private int compareNodes(List<HTMLNode> links) {
    if(links.size() < 3) return 0;
    Attributes attributes = links.get(0).getAttributes();
    int counter = 1;
    for(int i = 1; i < links.size(); i++) {
      Attributes attributes1 = links.get(i).getAttributes();
      if(compareAttributes(attributes, attributes1)) counter++;
    }
    return (counter*100)/links.size() ;
  }
 
  private boolean compareAttributes(Attributes attributes1, Attributes attributes2) {
    if(attributes1.size() != attributes2.size()) return false;
    for(int i = 0; i < attributes1.size(); i++) {
      Attribute attr1 = attributes1.get(i);
      Attribute attr2 = attributes2.get(i);
      String name = attr1.getName();
      if(isIgnoreAttribute(name)) continue;
      if(!name.equalsIgnoreCase(attr2.getName()))  return false;
      if(isIgnoreAttributeValue(name)) continue;
      if(!attr1.getValue().equalsIgnoreCase(attr2.getValue()))  return false;
    }
    return true;
  }
 
  private boolean isIgnoreAttribute(String name) {
    if("href".equalsIgnoreCase(name)) return true;
    if("onclick".equalsIgnoreCase(name)) return true;
    return false;
  }
 
  private boolean isIgnoreAttributeValue(String name) {
    if("class".equalsIgnoreCase(name)) return false;
    return true;
  }
 
  public boolean hasParagraph(HTMLNode node, int max) {
    AnalyticsRenderer renderer = new AnalyticsRenderer(node, true);
    if(renderer.getParagraph() < 1) return false;

    String [] elements = renderer.getTextValue().toString().split("\n");
    TextCounter textCounter = contentChecker.getTextCounter();
    for(String element : elements) {
      int counter = textCounter.countSentence(element);
      if(counter > max) return true;

      counter = textCounter.countWord(element, 0, element.length());
      if(counter >= 15) return true;
    }
    return false;
  }
 
  private boolean hasBlockLink(HTMLNode node) {
    List<HTMLNode> children  = node.getChildren();
    if(children == null) return false;
    for(int i = 0; i < children.size(); i++) {
      switch (children.get(i).getName()) {
      case DIV:
      case TABLE:
        if(isBlockLink(children.get(i))) return true;
        break;
      default:
        break;
      }
    }
    for(int i = 0; i < children.size(); i++) {
      if(hasBlockLink(children.get(i))) return true;
    }
    return false;
  }
 
  private boolean isBlockLink(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    int counter = 0;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.A)) counter++;
    }
    return counter > 3;
  }
 
}

TOP

Related Classes of org.vietspider.html.renderer.checker.LinkBlockChecker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.