Package org.exoplatform.test.crawler

Source Code of org.exoplatform.test.crawler.SingleCrawlThread

/**
* Copyright (C) 2009 eXo Platform SAS.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/

package org.exoplatform.test.crawler;

import org.exoplatform.services.html.HTMLDocument;
import org.exoplatform.services.html.parser.HTMLParser;
import org.exoplatform.services.html.path.NodePath;
import org.exoplatform.services.html.path.NodePathUtil;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;

/**
* Created by The eXo Platform SARL
* Author : Lai Van Khoi
*          laivankhoi46pm1@yahoo.com
* Dec 1, 2006 
*/
public class SingleCrawlThread extends Thread
{

   private String url_;

   private String charset_ = "utf-8";

   private NodePath childPath_; //Note. Can replace it with: NodePath[] childPath.

   private int index_ = 0;

   private ByteArrayOutputStream data;

   public SingleCrawlThread()
   {
      new Thread(this).start(); //Note.
   }

   //---------------------------------------
   public void run()
   {
      while (true)
      {
         try
         {
            if (this.url_ != null)
            {
               System.out.println("Start download " + this.url_);
               //Get data from downloading through a specified URL.
               data = this.loadInputStream(new URL(this.url_).openStream());
               this.url_ = null;
            }
            Thread.sleep(1000);
         }
         catch (Exception exp)
         {
            exp.printStackTrace();
         }
      }
   }

   //--------------------------------------
   private ByteArrayOutputStream loadInputStream(InputStream input) throws Exception
   {
      ByteArrayOutputStream output = new ByteArrayOutputStream();

      BufferedInputStream buffer = new BufferedInputStream(input);
      byte[] bytes = new byte[buffer.available()];
      int available = -1;
      while ((available = buffer.read(bytes)) > -1)
      {
         output.write(bytes, 0, available);
      }
      return output;
   }

   //-----------------------------------
   public void startDownload(String url)
   {
      this.url_ = url;
   }

   //----------------------------------
   public void startDownload(String url, int idx, NodePath childPath, String charset)
   {
      this.url_ = url;
      this.index_ = idx;
      this.childPath_ = childPath;
      this.charset_ = charset;
   }

   //---------------------------------
   public boolean isComplete()
   {
      return (this.url_ == null);
   }

   //---------------------------------
   public void saveData() throws Exception
   {
      String fileName = String.valueOf(this.index_) + ".htm";
      this.saveData(fileName);
   }

   //---------------------------------
   public void saveData(String fileName) throws Exception
   {
      if (this.data == null || this.data.size() < 1)
         return;

      File file = new File(fileName);
      System.out.println("FILE PATH: " + file.getAbsolutePath());
      //FileOutputStream is meant for writing streams of raw bytes of data such as image data.
      //For writing streams of characters, consider using FileWriter.
      FileOutputStream output = new FileOutputStream(file);

      //The initial whole HTMLDocument
      HTMLDocument document = HTMLParser.createDocument(this.data.toByteArray(), this.charset_);

      //The new HTMLDocument after spliting (separating) only childPath<NodePath>-->only a Node.
      document = NodePathUtil.create(document.getRoot(), new NodePath[]{this.childPath_});

      output.write(document.getTextValue().getBytes("utf-8"));
      output.flush();

      output.close();
      this.data = null;
   }

   public ByteArrayOutputStream getData()
   {
      return this.data;
   }
}
TOP

Related Classes of org.exoplatform.test.crawler.SingleCrawlThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.