// Package fr.eolya.utils.http
//
// Source code of fr.eolya.utils.http.HttpStream

package fr.eolya.utils.http;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Locale;
import java.util.Vector;
import java.util.zip.GZIPInputStream;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

import fr.eolya.extraction.CharsetRecognizer;

public class HttpStream {

  private InputStream input = null;
  private String rawData = null;
  private String charSet = null;
  private String declaredLanguage = null;
  private String declaredEncoding = null;
  private String contentType = null;
  private File tempFile = null;
  private String metaEncodingBalise = null;

  public HttpStream (InputStream input, String declaredEncoding, String contentType, String contentEncoding)
  {
    this.input = input;

    if (contentEncoding!=null && "gzip".equals(contentEncoding)) {
      try {
        this.input = new GZIPInputStream(input);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    this.declaredEncoding = declaredEncoding;
    this.contentType = contentType;
  }

  public void clear()
  {
    if (tempFile!=null)
      tempFile.delete();
  }

  public String getCharSet()
  {
    if (charSet!=null)
      return charSet;

    String tempExt = ".txt";
    Hashtable<String,Integer> encodingFreq = new Hashtable<String,Integer>();

    String encodingInContentType = "";

    try{   
      if (contentType == null) contentType = "";
      contentType = contentType.toLowerCase();

      if (contentType.startsWith("text/html"))
      {
        encodingInContentType = HttpUtils.parseCharacterEncoding(contentType);
        if (encodingInContentType!=null && !"".equals(encodingInContentType))
          encodingFreq.put(encodingInContentType, 1);
        contentType = "text/html";
        tempExt = ".html";
     

      // On sauvegarde le flux dans un fichier temporaire
      tempFile = File.createTempFile("tmp", tempExt);
      DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)));
      int c;
      while((c = input.read()) != -1) {
        out.writeByte(c);
      }
      input.close();
      out.close();

      // charset detection with jchardet
      BufferedInputStream bis = new BufferedInputStream(new FileInputStream(tempFile));     
      String[] aCharSet = CharsetRecognizer.detect(0, bis);
      bis.close();
      String encoding = HttpUtils.filtreEncoding(aCharSet[0].toLowerCase());
      if (encoding!=null && !"".equals(encoding) && !"nomatch".equals(encoding))
      {
        if (encodingFreq.containsKey(encoding))
          encodingFreq.put(encoding, encodingFreq.get(encoding) + 2);
        else
          encodingFreq.put(encoding, 2);
      }

      // charset detection with icu
      try {
        bis = new BufferedInputStream(new FileInputStream(tempFile));
        CharsetDetector detector;
        detector = new CharsetDetector();
        detector.enableInputFilter(true);
        detector.setText(bis);
        if (declaredEncoding!=null && !"".equals(declaredEncoding))
          detector.setDeclaredEncoding(declaredEncoding);
        CharsetMatch[] matches = null;
        matches = detector.detectAll();
        bis.close();
        encoding = HttpUtils.filtreEncoding(matches[0].getName().toLowerCase());
        if (encoding!=null && !"".equals(encoding))
        {
          if (encodingFreq.containsKey(encoding))
            encodingFreq.put(encoding, encodingFreq.get(encoding) + 2);
          else
            encodingFreq.put(encoding, 2);
        }
      } catch (Exception e) {
        //e.printStackTrace();
      }

      // lecture du charset dans la balise contentType dans le fichier
      //FileReader fr = new FileReader(tempFile) ;

      InputStreamReader fr = null;
      if (encodingInContentType!=null && !"".equals(encodingInContentType)) {
        fr = new InputStreamReader(new FileInputStream(tempFile), encodingInContentType);
      }
      else {
        fr = new InputStreamReader(new FileInputStream(tempFile), "UTF-8");       
      }

      BufferedReader br = new BufferedReader(fr);

      boolean find = false;
      while(br.ready() && !find){
        String line = br.readLine();
        String lineLowerCase = line.toLowerCase();
        //if (lineLowerCase.indexOf("<meta")>=0 && lineLowerCase.indexOf("http-equiv")>=0 && lineLowerCase.indexOf("content-type")>=0 )
        if (lineLowerCase.indexOf("charset")>=0 && lineLowerCase.indexOf("http-equiv")>=0 && lineLowerCase.indexOf("content-type")>=0 )
        {

          int off = Math.max(lineLowerCase.indexOf("charset"), lineLowerCase.indexOf("http-equiv"));
          off = Math.max(off, lineLowerCase.indexOf("content-type"));
          if (lineLowerCase.indexOf(">", off)!=-1) {
            //          if (lineLowerCase.indexOf(">", off)==-1 && br.ready())
            //          {
            //            // la balise ne se termine pas sur cette ligne, on ajoute la suivante
            //            line += "\n" + br.readLine();
            //            lineLowerCase = line.toLowerCase();
            //          }
            //For test only. lineLowerCase = "<meta content=\"text/html; charset=iso-8859-1\" http-equiv=\"content-type\">";         
            encoding = HttpUtils.parseCharacterEncoding(lineLowerCase);
            if (encoding!=null && !"".equals(encoding))
            {
              if (encodingFreq.containsKey(encoding))
                encodingFreq.put(encoding, encodingFreq.get(encoding) + 3);
              else
                encodingFreq.put(encoding, 3);
            }
            //          int start = lineLowerCase.indexOf("<meta");
            //          boolean find2=false;
            //          while (start>=0 && !find2)
            //          {
            //            int stop = lineLowerCase.indexOf(">", start);
            //            metaEncodingBalise = line.substring(start, stop+1);
            //            if (metaEncodingBalise.toLowerCase().indexOf("<meta")>=0 && metaEncodingBalise.toLowerCase().indexOf("http-equiv")>=0 && metaEncodingBalise.toLowerCase().indexOf("content-type")>=0 )
            //              find2 = true;
            //            start = lineLowerCase.indexOf("<meta", start+1);
            //          }
            find = true;
          }
        }
      }
      br.close() ;

      // Get the best candidate
      Vector<String> v = new Vector<String>(encodingFreq.keySet());
      Iterator<String> it = v.iterator();
      int max = 0;
      encoding = "";
      while (it.hasNext()) {
        String element =  (String)it.next();
        //System.out.println( element + " " + encodingFreq.get(element));
        if (encodingFreq.get(element)>max)
        {
          max = encodingFreq.get(element);
          encoding = element;
        }
      }
      this.charSet = encoding;
      return charSet;
    }
    catch(Exception e)
    {
      e.printStackTrace();
   

    return "";

  }

  public String getString()
  {
    if (rawData!=null)
      return rawData;

    try
      if (contentType == null) contentType = "";
      contentType = contentType.toLowerCase();

      if (charSet==null){
        getCharSet()
      }

      StringBuffer buffer = new StringBuffer();
      BufferedInputStream bis = new BufferedInputStream(new FileInputStream(tempFile))

      InputStreamReader isr = null;
      try {
        isr = new InputStreamReader(bis, charSet);
      }
      catch (Exception e) {
        isr = new InputStreamReader(bis, "UTF-8");
      }
      //if (isr==null) return null;

      int ch;
      while ((ch=isr.read())>-1)
      {
        buffer.append((char)ch);
      }
      isr.close();
      String ret = buffer.toString();

      if (contentType.startsWith("text/html") && metaEncodingBalise!=null && !"".equals(metaEncodingBalise))
      {
        // on met le bon charset dans la balise contenttype
        ret = ret.replace(metaEncodingBalise, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + charSet + "\">" );       
      }
      rawData = ret;
      return ret;

    }
    catch(Exception e)
    {
      e.printStackTrace();
   

    return "";
  }


  public String getDeclaredLanguage()
  {
    if (declaredLanguage!=null)
      return declaredLanguage;

    if (rawData==null)
      getString();

    return HttpUtils.getHtmlDeclaredLanguage(rawData);
  }


  //  public static void main(String[] args) throws IOException {
  //   
  //    InputStreamReader fr = null;
  //    fr = new InputStreamReader(new FileInputStream("/tmp/tmp251061790944559968.html"), "ISO-8859-1");
  // 
  //    BufferedReader br = new BufferedReader(fr);
  //
  //    boolean find = false;
  //    while(br.ready() && !find){
  //      String line = br.readLine();
  //      String lineLowerCase = line.toLowerCase();
  //
  //    }
  //    br.close() ;
  //  }
}
// TOP
//
// Related Classes of fr.eolya.utils.http.HttpStream
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.