Package org.vietspider.html.parser

Source Code of org.vietspider.html.parser.HTMLParser

/*
* Copyright 2004-2006 The VietSpider        All rights reserved.
*
* Created on January 24, 2006, 7:48 PM
*/

package org.vietspider.html.parser;

import java.io.File;
import java.io.InputStream;
import java.util.List;

import org.vietspider.chars.CharsDecoder;
import org.vietspider.chars.jchardet.nsDetector;
import org.vietspider.chars.jchardet.nsICharsetDetectionObserver;
import org.vietspider.chars.jchardet.nsPSMDetector;
import org.vietspider.common.io.DataReader;
import org.vietspider.html.HTMLDocument;
import org.vietspider.html.HTMLNode;
/**
* @author nhuthuan
* Email: nhudinhthuan@yahoo.com
*/
@Deprecated()
public final class HTMLParser {

  private static String charset_ = null;
 
  @Deprecated()
  public static HTMLNode clone(HTMLNode node){
//    NodeImpl nodeImpl = (NodeImpl)node;
//    int type = nodeImpl.getType();
//    if(type == TypeToken.CONTENT
//        || type == TypeToken.COMMENT || type == TypeToken.CODE_CONTENT) {
//      return new NodeImpl(nodeImpl.getValue(), nodeImpl.getName());
//    }
//    return new NodeImpl(nodeImpl.getValue(), nodeImpl.getName(), nodeImpl.getType());
    return HTMLParser2.clone(node);
  }
 
  @Deprecated()
  public static synchronized List<NodeImpl> createTokens(char[] data) throws Exception {
    return new HTMLParser2().createTokens(data);
//    CharsToken tokens = new CharsToken();
//    ParserService.getTokenParser().createBeans(tokens, data, new TokenCreator(tokens));
//    List<NodeImpl> list = new ArrayList<NodeImpl>();
//    while(tokens.hasNext()){   
//      list.add(tokens.pop());
//    }
//    return list;
  }
 
  @Deprecated()
  public static synchronized HTMLDocument createDocument(char[] data) throws Exception {
    return new HTMLParser2().createDocument(data);
//    CharsToken tokens = new CharsToken();
//    ParserService.getTokenParser().createBeans(tokens, data, new TokenCreator(tokens));
//    ParserService.parse(tokens, tokens.getDocument());
//    return tokens.getDocument();
  }
 
  @Deprecated()
  public static HTMLDocument createDocument(CharsToken tokens) throws Exception {
    return new HTMLParser2().createDocument(tokens);
//    ParserService.parse(tokens, tokens.getDocument());
//    return tokens.getDocument();
  }
 
  @Deprecated()
  public static HTMLDocument createDocument(List<NodeImpl> list) throws Exception {
   /* CharsToken tokens = new CharsToken();
    for(int i = 0; i < list.size(); i++) {
      tokens.push(list.get(i));
    }
    ParserService.parse(tokens, tokens.getDocument());
    return tokens.getDocument();*/
    return  new HTMLParser2().createDocument(list);
  }

  public static HTMLDocument createDocument(String text) throws Exception {
    return createDocument(text.toCharArray());
  }

  public static HTMLDocument createDocument(byte[] data, String charset) throws Exception {
    if(charset == null) charset = detect(data);
    char [] chars = CharsDecoder.decode(charset, data, 0, data.length);
    return createDocument(chars);
 

  @Deprecated()
  public static HTMLDocument createDocument(InputStream input, String charset) throws Exception {
    DataReader reader = new DataReader();
    return createDocument(reader.loadInputStream(input).toByteArray(), charset)
  }

  @Deprecated()
  public static HTMLDocument createDocument(File file, String charset) throws Exception {
    DataReader reader = new DataReader();
    return createDocument(reader.load(file), charset);
 

  @Deprecated()
  public static String detect(byte [] buf){
    nsDetector det = new nsDetector(nsPSMDetector.ALL) ;
    charset_ = null;
    det.Init(new nsICharsetDetectionObserver() {
      @Override
      public void Notify(String charset) {
        charset_ = charset;
       
      }
    });

    boolean isAscii = true ;
    int len = buf.length;
   
    isAscii = det.isAscii(buf, len);  
    if (!isAscii) det.DoIt(buf, len, false);
    det.DataEnd();
   
    if (isAscii) charset_ = "ASCII";
    return charset_;
  }
}
TOP

Related Classes of org.vietspider.html.parser.HTMLParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.