package org.archive.access.nutch.jobs;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.SQLException;
import java.util.Random;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.LinkDbFilter;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchJob;
import org.archive.access.nutch.Nutchwax;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.jobs.sql.SqlSearcher;
import org.apache.nutch.global.Global;
/**
* Subclass of the Nutch LinkDb that writes out LinkDb keys that include the
* collection name.
* Most of the code is copied from LinkDb, which is not amenable to
* subclassing.
* @author stack
*/
public class NutchwaxLinkDb extends LinkDb
{
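// Copies of LinkDb configuration that is private in the parent class ('nw' prefix).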
private int nwMaxAnchorLength;
private boolean nwIgnoreInternalLinks;
private URLFilters nwUrlFilters;
private URLNormalizers nwUrlNormalizers;
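// Multiple-collection mode: when Global.COLLECTION_TYPE is COLLECTION_TYPE_MULTIPLE,
// outlink targets are resolved to the capture nearest in time via SqlSearcher.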
private SqlSearcher sqlsearcher;
private String collectionType;
private String databaseConnection;
private String databaseUsername;
private String databasePassword;
public NutchwaxLinkDb()
{
super(null);
}
/** Construct a NutchwaxLinkDb. */
public NutchwaxLinkDb(Configuration conf)
{
super(conf);
}
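/**
* Reads the LinkDb settings (anchor length, internal-link handling, URL
* filtering and normalizing) into the 'nw' copies and, in multiple-collection
* mode, opens the SqlSearcher database connection.
*/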
public void configure(JobConf job)
{
super.configure(job);
// These configuration values are private in the parent class, so keep
// copies here with an 'nw' prefix. St.Ack.
this.nwMaxAnchorLength = job.getInt("db.max.anchor.length", 100);
this.nwIgnoreInternalLinks =
job.getBoolean("db.ignore.internal.links", true);
if (job.getBoolean(LinkDbFilter.URL_FILTERING, false))
{
this.nwUrlFilters = new URLFilters(job);
}
if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false))
{
this.nwUrlNormalizers =
new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
}
this.collectionType = job.get(Global.COLLECTION_TYPE);
LOG.debug("Collection type: " + collectionType + ", requested key: " + Global.COLLECTION_TYPE);
if (Global.COLLECTION_TYPE_MULTIPLE.equals(collectionType)) {
this.databaseConnection = job.get(Global.DATABASE_CONNECTION);
this.databaseUsername = job.get(Global.DATABASE_USERNAME);
this.databasePassword = job.get(Global.DATABASE_PASSWORD);
try {
sqlsearcher = new SqlSearcher();
sqlsearcher.connect(databaseConnection, databaseUsername, databasePassword);
}
catch (Exception e) {
LOG.error("Error connecting to database: "+e.getMessage());
LOG.error("Error connecting to database: "+e.toString());
sqlsearcher=null;
}
}
}
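/**
* Inverts the outlinks of one page. The key is a NutchWAX key carrying both
* URL and collection; the value is the page's ParseData. For every surviving
* outlink an Inlink(fromUrl, anchor) is emitted under the target's NutchWAX
* key; in multiple-collection mode the target key also carries the timestamp
* of the capture closest to the source page.
*/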
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException
{
String collection = Nutchwax.getCollectionFromWaxKey(key);
LOG.debug("Collection name is " + collection + " key: " + key);
if (collection == null)
{
LOG.info("Collection is null in key -- skipping " + key);
return;
}
String fromUrl = Nutchwax.getUrlFromWaxKey(key);
String fromHost = getHost(fromUrl);
if (this.nwUrlNormalizers != null)
{
try {
fromUrl = this.nwUrlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB);
}
catch (Exception e) {
LOG.warn("Skipping " + fromUrl + ":" + e);
fromUrl = null;
}
}
if (fromUrl != null && this.nwUrlFilters != null)
{
try {
fromUrl = this.nwUrlFilters.filter(fromUrl);
}
catch (Exception e) {
LOG.warn("Skipping " + fromUrl + ":" + e);
fromUrl = null;
}
}
if (fromUrl == null) { // discard all outlinks
return;
}
ParseData parseData = (ParseData)value;
Outlink[] outlinks = parseData.getOutlinks();
Inlinks inlinks = new Inlinks();
String fromUrlOriginalCollectionName = null;
String fromUrlTimestamp = null;
if (Global.COLLECTION_TYPE_MULTIPLE.equals(collectionType)) {
fromUrlOriginalCollectionName = SqlSearcher.getCollectionNameOriginal(collection);
fromUrlTimestamp = SqlSearcher.getTimestampOriginal(collection);
}
for (int i = 0; i < outlinks.length; i++) {
Outlink outlink = outlinks[i];
String toUrl = outlink.getToUrl();
if (this.nwIgnoreInternalLinks)
{
String toHost = getHost(toUrl);
if (toHost == null || toHost.equals(fromHost)) { // internal link
continue; // skip it
}
}
if (this.nwUrlNormalizers != null)
{
try {
toUrl = this.nwUrlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB);
}
catch (Exception e) {
LOG.warn("Skipping " + toUrl + ":" + e);
toUrl = null;
}
}
if (toUrl != null && this.nwUrlFilters != null) {
try {
toUrl = this.nwUrlFilters.filter(toUrl); // filter the url
if (toUrl==null) {
LOG.info("LINKDB URL FILTERED");
}
}
catch (Exception e) {
LOG.warn("Skipping " + toUrl + ":" + e);
toUrl = null;
}
}
if (toUrl == null) {
continue;
}
inlinks.clear();
String anchor = outlink.getAnchor(); // truncate long anchors
if (anchor.length() > this.nwMaxAnchorLength) {
anchor = anchor.substring(0, this.nwMaxAnchorLength);
}
inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
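// Multiple-collection mode: emit the link only when the source and target
// captures are mutually nearest in time (a 1-1 match), keyed by the original
// collection name plus the matched timestamp. Otherwise (single collection)
// key the link by the collection name alone.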
if (Global.COLLECTION_TYPE_MULTIPLE.equals(collectionType)) {
try {
String toUrlNearTimestamp = sqlsearcher.selectNearTimestamp(toUrl, fromUrlTimestamp);
LOG.debug("LinkDB: toUrlNearTimestamp: " + toUrlNearTimestamp + " toUrl: " + toUrl + " fromUrlTimestamp: " + fromUrlTimestamp);
if (toUrlNearTimestamp != null) {
String fromUrlNearTimestamp = sqlsearcher.selectNearTimestamp(fromUrl, toUrlNearTimestamp); // is the 'from url' also the closest to this document?
LOG.debug("LinkDB: fromUrlNearTimestamp: " + fromUrlNearTimestamp + " fromUrlTimestamp: " + fromUrlTimestamp + " fromUrl: " + fromUrl);
if (fromUrlNearTimestamp != null && fromUrlNearTimestamp.equals(fromUrlTimestamp)) { // if A is closest to B and vice-versa -> 1-1 relation
LOG.debug("LinkDB: from: " + key.toString() + " to: " + Nutchwax.generateWaxKey(toUrl, SqlSearcher.getCollectionNameWithTimestamp(fromUrlOriginalCollectionName, toUrlNearTimestamp)));
output.collect(Nutchwax.generateWaxKey(toUrl, SqlSearcher.getCollectionNameWithTimestamp(fromUrlOriginalCollectionName, toUrlNearTimestamp)), inlinks);
}
}
}
catch (SQLException e) {
LOG.error("LinkDB error: "+e.getMessage()+", toUrl:"+toUrl+", fromUrlTimestamp:"+fromUrlTimestamp);
}
}
else {
output.collect(new Text(Nutchwax.generateWaxKey(toUrl, collection)), inlinks);
}
}
}
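/** @return the lower-cased host of the given URL, or null if the URL is malformed. */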
private String getHost(String url)
{
try
{
return new URL(url).getHost().toLowerCase();
}
catch (MalformedURLException e)
{
return null;
}
}
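/**
* Copy of LinkDb.invert() that runs this class's createJob() (so the mapper
* is NutchwaxLinkDb) and, when merging with an existing linkdb, swaps in
* NutchwaxLinkDbFilter as the merge mapper.
*/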
public void invert(Path linkDb, final Path[] segments,
final boolean normalize, final boolean filter, boolean force)
throws IOException
{
Path lock = new Path(linkDb, LOCK_NAME);
FileSystem fs = FileSystem.get(getConf());
LockUtil.createLockFile(fs, lock, force);
Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
if (LOG.isInfoEnabled())
{
LOG.info("NutchwaxLinkDb: starting");
LOG.info("NutchwaxLinkDb: linkdb: " + linkDb);
LOG.info("LinkDb: URL normalize: " + normalize);
LOG.info("LinkDb: URL filter: " + filter);
}
JobConf job = createJob(getConf(), linkDb, normalize, filter);
for (int i = 0; i < segments.length; i++)
{
if (LOG.isInfoEnabled())
{
LOG.info("LinkDb: adding segment: " + segments[i]);
}
job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
}
try
{
JobClient.runJob(job);
}
catch (IOException e)
{
LockUtil.removeLockFile(fs, lock);
throw e;
}
if (fs.exists(currentLinkDb))
{
if (LOG.isInfoEnabled())
{
LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
}
// try to merge
Path newLinkDb = job.getOutputPath();
job = LinkDb.createMergeJob(getConf(), linkDb, normalize, filter);
job.setJobName("NutchwaxLinkDb merge " + linkDb + " " +
Arrays.asList(segments));
job.setMapperClass(NutchwaxLinkDbFilter.class);
job.addInputPath(currentLinkDb);
job.addInputPath(newLinkDb);
try
{
JobClient.runJob(job);
}
catch (IOException e)
{
LockUtil.removeLockFile(fs, lock);
fs.delete(newLinkDb);
throw e;
}
fs.delete(newLinkDb);
}
LinkDb.install(job, linkDb);
if (LOG.isInfoEnabled())
{
LOG.info("LinkDb: done");
}
}
/**
* Copied from the parent because the method is private there (it is public
* in CrawlDb). Additions are at the end, just before the return.
* @param config
* @param linkDb
* @param normalize
* @param filter
* @return A jobconf.
*/
private static JobConf createJob(Configuration config, Path linkDb,
final boolean normalize, final boolean filter)
{
Path newLinkDb = new Path("linkdb-" +
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(config);
job.setJobName("linkdb " + linkDb);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(LinkDb.class);
// if we don't run the mergeJob, perform normalization/filtering now
if (normalize || filter)
{
try
{
FileSystem fs = FileSystem.get(config);
if (!fs.exists(linkDb))
{
job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
}
}
catch (Exception e)
{
LOG.warn("LinkDb createJob: " + e);
}
}
job.setReducerClass(LinkDb.class);
job.setOutputPath(newLinkDb);
job.setOutputFormat(MapFileOutputFormat.class);
job.setBoolean("mapred.output.compress", true);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Inlinks.class);
// Now apply the NutchwaxLinkDb configuration: change the mapper (LinkDb's
// reducer is reused) and the job name.
job.setJobName("nutchwaxLinkdb " + linkDb);
job.setMapperClass(NutchwaxLinkDb.class);
return job;
}
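/** Command-line entry point; runs doMain() with the NutchWAX configuration. */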
public static void main(String[] args) throws Exception
{
int res = new NutchwaxLinkDb().
doMain(NutchwaxConfiguration.getConfiguration(), args);
System.exit(res);
}
}