Source Code of org.exoplatform.test.crawler.SingleCrawlThread

/**
 * Copyright (C) 2009 eXo Platform SAS.
 * 
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 * 
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */


package org.exoplatform.test.crawler;


import org.exoplatform.services.html.HTMLDocument;
import org.exoplatform.services.html.parser.HTMLParser;
import org.exoplatform.services.html.path.NodePath;
import org.exoplatform.services.html.path.NodePathUtil;


import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;


/**
 * Created by The eXo Platform SARL
 * Author : Lai Van Khoi
 *          laivankhoi46pm1@yahoo.com
 * Dec 1, 2006  
 */
public class SingleCrawlThread extends Thread
{


   private String url_;


   private String charset_ = "utf-8";


   private NodePath childPath_; //Note. Can replace it with: NodePath[] childPath.


   private int index_ = 0;


   private ByteArrayOutputStream data;


   public SingleCrawlThread()
   {
      new Thread(this).start(); //Note.
   }


   //---------------------------------------
   public void run()
   {
      while (true)
      {
         try
         {
            if (this.url_ != null)
            {
               System.out.println("Start download " + this.url_);
               //Get data from downloading through a specified URL.
               data = this.loadInputStream(new URL(this.url_).openStream());
               this.url_ = null;
            }
            Thread.sleep(1000);
         }
         catch (Exception exp)
         {
            exp.printStackTrace();
         }
      }
   }


   //--------------------------------------
   private ByteArrayOutputStream loadInputStream(InputStream input) throws Exception
   {
      ByteArrayOutputStream output = new ByteArrayOutputStream();


      BufferedInputStream buffer = new BufferedInputStream(input);
      byte[] bytes = new byte[buffer.available()];
      int available = -1;
      while ((available = buffer.read(bytes)) > -1)
      {
         output.write(bytes, 0, available);
      }
      return output;
   }


   //-----------------------------------
   public void startDownload(String url)
   {
      this.url_ = url;
   }


   //----------------------------------
   public void startDownload(String url, int idx, NodePath childPath, String charset)
   {
      this.url_ = url;
      this.index_ = idx;
      this.childPath_ = childPath;
      this.charset_ = charset;
   }


   //---------------------------------
   public boolean isComplete()
   {
      return (this.url_ == null);
   }


   //---------------------------------
   public void saveData() throws Exception
   {
      String fileName = String.valueOf(this.index_) + ".htm";
      this.saveData(fileName);
   }


   //---------------------------------
   public void saveData(String fileName) throws Exception
   {
      if (this.data == null || this.data.size() < 1)
         return;


      File file = new File(fileName);
      System.out.println("FILE PATH: " + file.getAbsolutePath());
      //FileOutputStream is meant for writing streams of raw bytes of data such as image data.
      //For writing streams of characters, consider using FileWriter.
      FileOutputStream output = new FileOutputStream(file);


      //The initial whole HTMLDocument
      HTMLDocument document = HTMLParser.createDocument(this.data.toByteArray(), this.charset_);


      //The new HTMLDocument after spliting (separating) only childPath<NodePath>-->only a Node.
      document = NodePathUtil.create(document.getRoot(), new NodePath[]{this.childPath_});


      output.write(document.getTextValue().getBytes("utf-8"));
      output.flush();


      output.close();
      this.data = null;
   }


   public ByteArrayOutputStream getData()
   {
      return this.data;
   }
}
Source Code of org.exoplatform.test.crawler.SingleCrawlThread

Related Classes of org.exoplatform.test.crawler.SingleCrawlThread