Package org.vietspider.html.renderer.content

Source Code of org.vietspider.html.renderer.content.TextAnalyticsRenderer

/***************************************************************************
* Copyright 2001-2008 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.renderer.content;

import java.util.List;

import org.vietspider.common.text.TextCounter;
import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;

/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Sep 30, 2008 
*/
public final class TextAnalyticsRenderer {

  private String text;
 
  private int word = 0;
  private int paragraph = 0;
  private int sentence = 0;
 
  public TextAnalyticsRenderer (HTMLNode root) {
    StringBuilder builder = new StringBuilder();
    build(builder, root);
    text = builder.toString();
   
    processTextValue();
  }

  public void build(StringBuilder builder, HTMLNode node) {
    switch (node.getName()) {
    case CONTENT:
      char [] chars = node.getValue();
      if(!isEmpty(chars)) {
        for(int k = 0; k < chars.length; k++) {
          builder.append(chars[k] == '\n' ? ' ' : chars[k]);
        }
        HTMLNode parent = node.getParent();
        if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
      }
      break;
    case IMG:
    case OBJECT:
    case H1:
    case H2:
    case H3:
    case H4:
    case H5:
    case H6:
    case BR:
    case P:
    case LI:  
      if(!isEndWithNewLine(builder)) {
        builder.append('\n');
      }
      break;
    case TR:
    case TABLE:
    case TD:
    case DIV:
      if(!isEndWithNewLine(builder)) {
        builder.append('\n');
      }
      break;
    case SCRIPT:
    case STYLE:
      return;
    default:
      if(builder.length() > 0) {
        char c = builder.charAt(builder.length()-1);
        if(!(Character.isWhitespace(c)
            || Character.isSpaceChar(c))) {
          builder.append(' ');
        }
      }
    break;
    }
    List<HTMLNode> children = node.getChildren();
    if(children == null) return;
    for(int i = 0; i < children.size(); i++) {
      build(builder, children.get(i));
    }
  }

  public String getTextValue() { return text; }

  private final boolean isEndWithNewLine(StringBuilder value) {
    int i = value.length()-1;
    while(i > -1) {
      char c = value.charAt(i);
      if(c == '\n') {
        return true;
      } else if(Character.isWhitespace(c)
          || Character.isSpaceChar(c)) {
        i--;
        continue;
      }
      return false;
    }
    return true;
  }

  private final boolean isEmpty(char [] chars) {
    int i = 0;
    while(i < chars.length) {
      if(Character.isWhitespace(chars[i])
          || Character.isSpaceChar(chars[i])) {
        i++;
        continue;
      }
      return false;
    }
    return true;
  }

  private void processTextValue() {
    String [] elements = text.split("\n");
    TextCounter textCounter = new TextCounter();
    for(int i = 0; i < elements.length; i++) {
      int value  = textCounter.countSentence(elements[i]);
      if(value > 1) {
//        System.out.println(" : == =======================================");
//        System.out.println("papa a "+ elements[i]);
        paragraph++;
      }
      sentence += value;
     
      word += textCounter.countWords(elements[i]);
    }
  }

  public int getWord() { return word; }

  public int getParagraph() { return paragraph; }

  public int getSentence() { return sentence; }
 
  public int compare(TextAnalyticsRenderer renderer) {
    int compare = paragraph - renderer.getParagraph();
    if(compare != 0) return compare;
    compare = sentence - renderer.getSentence();
    if(compare != 0) return compare;
    return word - renderer.getWord();
  }
 
}
TOP

Related Classes of org.vietspider.html.renderer.content.TextAnalyticsRenderer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.