Package net.matuschek.jobo

Source Code of net.matuschek.jobo.JoBoBase

package net.matuschek.jobo;

/************************************************
    Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/

import java.io.File;
import java.io.FileWriter;
import java.io.Writer;

import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.HttpDocToFile;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.spider.RegExpURLCheck;
import net.matuschek.spider.WebRobot;
import net.matuschek.spider.WebRobotCallback;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.LinkLocalizer;

import org.apache.log4j.Category;

import org.exolab.castor.mapping.Mapping;
import org.exolab.castor.xml.Marshaller;
import org.exolab.castor.xml.Unmarshaller;

import org.xml.sax.InputSource;

/**
* This is a simple class that contains all needed features for JoBo
* (the web robot, the download rules, RegExpUrlCheck ...)
*
* @author Daniel Matuschek
* @version $Revision: 1.21 $
*/
public class JoBoBase  {

  /** Log4J logging */
  private static Category log = Category.getInstance("");

  /** The file used for XML->Java mapping */
  private static String mappingfile="mapping.xml";

  /** The jobo configuration in XML */
  private static String xmlconfig="jobo.xml";

  /** Start URL for the robot */
//  private static String startUrl=null;
 
  private String storageDirectory = "/tmp";
  private WebRobot robot = null;
  private RegExpURLCheck urlcheck = null;
  private DownloadRuleSet downloadrules = null;
  private HttpDocToFile docstore = null;

  /** Filter to localize included links */
  private LinkLocalizer linkLocalizer = null;

  /** FilterChains with all filters */
  private FilterChain filters = null;
     

  /**
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason
   */
  public JoBoBase()
    throws ClassNotFoundException
  {
    log = Category.getInstance(this.getClass());
    docstore = new HttpDocToFile(storageDirectory);
    initializeFilters();
    robot = new WebRobot();
    robot.setFilters(filters);
  }


  /**
   * Set the default filter chain
   */
  public void initializeFilters() {
    filters = new FilterChain();
    linkLocalizer = new LinkLocalizer();
    filters.add(linkLocalizer);
  }


  /**
   * write the settings to an XML file
   */
  public void saveConfig(String filename) {
    File f1 = new File(mappingfile);

    if (f1.exists()) {
      Mapping mapping = new Mapping();
      try {
  mapping.loadMapping(mappingfile);
  Writer writer = new FileWriter(filename);
  Marshaller marshaller = new Marshaller(writer);
  marshaller.setMapping(mapping);
  marshaller.marshal(this);
  writer.close();
 
  log.info("written to XML");
      } catch (Exception e) {
  log.error(e.getMessage());
  e.printStackTrace();
      }
    } else {
      log.error("mapping and/or configfile not found");
    }
  }
 


  public void registerHttpToolCallback(HttpToolCallback cb) {
    robot.setHttpToolCallback(cb);
  }

  public void registerWebRobotCallback(WebRobotCallback cb) {
    robot.setWebRobotCallback(cb);
  }

  /**
   * registers the regexpurlcheck and the download rules with the robot
   */
  public void configureRobot() {
    robot.setURLCheck(urlcheck);
    robot.setDownloadRuleSet(downloadrules);
    robot.setDocManager(docstore);
    robot.setFilters(filters);
  }


 
  /**
   * Get the value of urlcheck.
   * @return Value of urlcheck.
   */
  public RegExpURLCheck getURLCheck () {
    return urlcheck;
  }
 
  /**
   * Set the value of urlcheck.
   * @param v  Value to assign to urlcheck.
   */
  public void setURLCheck(RegExpURLCheck  urlcheck ) {
    this.urlcheck = urlcheck;
  }
 
  /**
   * Get the value of robot.
   * @return Value of robot.
   */
  public WebRobot getRobot () {
    return robot;
  }
 
  /**
   * Set the value of robot. The new Robot will use the
   * filter that are defined in JoBoBase, even if he had
   * its own FilterChain before.
   *
   * @param robot WebRobot object to use
   */
  public void setRobot(WebRobot robot) {
    this.robot = robot;
    robot.setFilters(filters);
  }


  /**
   * Localize links ?
   *
   * @param localize if this is true, JoBo will trz to replace
   * absolute links by relative
   */
  public void setLocalizeLinks(boolean localize)
  {
    if (localize) {
      linkLocalizer.enable();
    } else {
      linkLocalizer.disable();
    }
  }


  /**
   * is link localization enabled ?
   */
  public boolean getLocalizeLinks() {
    return linkLocalizer.isEnabled();
  }


  /**
   * Get the value of downloadRules.
   * @return Value of downloadRules.
   */
  public DownloadRuleSet getDownloadRuleSet () {
    return downloadrules;
  }
 
  /**
   * Set the value of downloadRules.
   * @param v  Value to assign to downloadRules.
   */
  public void setDownloadRuleSet(DownloadRuleSet downloadRuleSet) {
    this.downloadrules = downloadRuleSet;
  }
 
  /**
   * Get the value of storageDirectory.
   * @return Value of storageDirectory.
   */
  public String getStorageDirectory () {
    return storageDirectory;
  }
 
  /**
   * Set the value of storageDirectory.
   * @param v  Value to assign to storageDirectory.
   */
  public void setStorageDirectory(String  storageDirectory ) {
    this.storageDirectory = storageDirectory;
    docstore.setBaseDir(storageDirectory);
  }


  /**
   * Enable/disable storing of dynamic documents (with an "?"
   * somewhere in the URL
   *
   * @param v true: enable storing of <b>all</b> documents,
   *          false: store only documents with an URL without "?"
   */
  public void setStoreCGI(boolean storeCGI) {
    this.docstore.setStoreCGI(storeCGI);
  }


  /**
   * Get the status of storeCGI
   *
   * @return the current status of storeCGI
   * @see #setStoreCGI for more information
   */
  public boolean getStoreCGI() {
    return this.docstore.getStoreCGI();
  }



  /**
   * Unmarshall the object from an XML file (jobo.xml) in the current
   * directory
   *
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason  
   */
  public static JoBoBase createFromXML()
    throws ClassNotFoundException
  {
    return createFromXML(".");
  }


  /**
   * Unmarshall the object from an XML file
   *
   * @param configDirectory name of the directory where jobo.xml and
   * mapping.xml should be read from.
   * @exception ClassNotFoundException if the Robot could not be instantiated
   * for some reason  
   */
  public static JoBoBase createFromXML(String configDirectory)
    throws ClassNotFoundException
  {
    JoBoBase baseobj = null;
   
    xmlconfig="jobo.xml";

    File f1 = new File(configDirectory+File.separatorChar+mappingfile);
    File f2 = new File(configDirectory+File.separatorChar+xmlconfig);

    if (f1.exists() && f2.exists()) {
      Mapping mapping = new Mapping();
      try {
  mapping.loadMapping(f1.getPath());
  Unmarshaller unmar = new Unmarshaller(mapping);
  unmar.setDebug(true);
  baseobj=(JoBoBase)unmar.unmarshal(new InputSource(f2.getPath()));
 
  log.info("configured from XML");
       
      } catch (Exception e) {
  log.error(e.getMessage());
  e.printStackTrace();
      }
    } else {
      log.error("mapping and/or configfile not found");
    }
     
    if (baseobj==null) {
      baseobj = new JoBoBase();
    }

    baseobj.configureRobot();

    return baseobj;
  }




} // JoBoBase
TOP

Related Classes of net.matuschek.jobo.JoBoBase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.