package org.archive.access.nutch.jobs;
import it.unimi.dsi.webgraph.*;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.*;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.access.nutch.Nutchwax.OutputDirectories;
import org.archive.access.nutch.jobs.graph.ArcListASCIIGraphExt;
import org.archive.access.nutch.jobs.graph.GraphManager;
import org.archive.access.nutch.jobs.graph.Pagerank;
import org.archive.access.nutch.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;
/**
* Computes PageRank over the NutchWAX link graph and writes the scores to HDFS
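*
* <p>Typical usage mirrors {@link #main}; the paths below are illustrative
* only, not fixed by this class:</p>
* <pre>
* NutchwaxPagerank pr = new NutchwaxPagerank();
* Path[] segments = pr.getFs().listPaths(new Path("outputs/segments")); // assumed layout
* pr.process(segments, new Path("outputs/pagerank"));
* </pre>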
* @author Miguel Costa
*/
public class NutchwaxPagerank
{
private final static boolean DEBUG=true;
private final static String GRAPH_FILE="graphList";
private final static String GRAPH_BV_FILE="graphBv";
public static final String SCORES_FILE_NAME = "scores.txt";
private static final Log LOG = LogFactory.getLog(NutchwaxPagerank.class);
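// NutchWAX keys and link targets carry a collection prefix, e.g. "c=gov1,u=http://...".
// The patterns below exclude non-web schemes (file, ftp, mailto, gopher) and image URLs.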
private static final String EXCLUDE_PATTERNS[] = {
    "^c=.*,u=(file|ftp|mailto|gopher).*",
    "^c=.*,u=.*(\\.gif|\\.bmp|\\.jpg|\\.jpeg|\\.png|\\.tif)$"};
private Configuration conf;
private FileSystem fs;
private Pattern excludePatterns[];
private String collection;
private boolean ignoreInternalLinks;
/**
* Constructor. Compiles the URL exclusion patterns and binds to the default configuration
* @throws IOException if the file system cannot be obtained
*/
public NutchwaxPagerank() throws IOException {
// initialize patterns
excludePatterns=new Pattern[EXCLUDE_PATTERNS.length];
for (int i=0;i<EXCLUDE_PATTERNS.length;i++) {
excludePatterns[i]=Pattern.compile(EXCLUDE_PATTERNS[i]);
}
this.conf = new JobConf(NutchwaxConfiguration.getConfiguration());
this.fs = FileSystem.get(conf);
// read the flag here too, so the no-arg constructor used by main() honors it
this.ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", true);
this.collection=null;
}
/**
* Constructor
* @param conf configuration, which also supplies db.ignore.internal.links
* @throws IOException if the file system cannot be obtained
*/
public NutchwaxPagerank(Configuration conf) throws IOException {
this();
this.conf = conf;
this.fs = FileSystem.get(conf);
this.ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", true);
}
/**
* Get file system
* @return the Hadoop file system in use
*/
public FileSystem getFs() {
return fs;
}
/**
* Read links from the segments' parse data and hand each (from, to) pair to the processor
* @param inputSegments segment directories
* @param processor callback invoked once per accepted outlink
* @throws IOException
*/
public void readLinks(Path inputSegments[], ReadLinksProcessor processor) throws IOException {
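// Scan each segment's ParseData MapFiles (the "part-*" directories) and hand
// every outlink that survives the host and pattern filters to the processor.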
Text key=new Text();
ParseData value=new ParseData();
Outlink[] outlinks=null;
Path parsedataPath=null;
String toUrl=null;
for (int j=0;j<inputSegments.length;j++) {
if (!fs.isDirectory(inputSegments[j])) {
throw new IOException("ERROR: "+inputSegments[j]+" is not a directory.");
}
parsedataPath=new Path(inputSegments[j],ParseData.DIR_NAME);
for (Path f : fs.listPaths(parsedataPath)) {
if (f.getName().startsWith("part-")) {
LOG.info("reading dir "+f);
MapFile.Reader reader=new MapFile.Reader(fs, (new Path(parsedataPath,f)).toString(), conf);
while (reader.next(key,value)) {
outlinks = value.getOutlinks();
for (int i=0; i<outlinks.length; i++) {
Outlink outlink = outlinks[i];
if (collection==null) { // something like "c=gov1,u="
collection=key.toString().substring(0,key.toString().indexOf(",u=")+3);
}
toUrl=collection+outlink.getToUrl();
boolean filterLink=false;
if (ignoreInternalLinks) {
String fromHost = getHost(key.toString());
String toHost = getHost(toUrl);
if (toHost==null || fromHost==null || toHost.equals(fromHost)) { // internal link
filterLink=true;
LOG.info("pagerank filtered link: "+fromHost+" "+toHost);
}
}
if (!filterLink && !filter(toUrl)) {
processor.run(key.toString(), toUrl); // run abstract method
}
}
}
reader.close();
}
}
}
}
/**
* Build the link graph and write it as a sequence file of (fromId, toId) arcs
* @param inputSegments
* @param outputPath
* @return number of nodes
* @throws IOException
*/
public int buildGraph(Path inputSegments[], Path outputPath) throws IOException {
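// GraphManager is assumed to hand out dense integer ids in first-seen order,
// so every (fromUrl, toUrl) link becomes an integer arc (fromId, toId).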
final SequenceFile.Writer writer = SequenceFile.createWriter(
    fs, conf, new Path(outputPath, GRAPH_FILE),
    ArquivoWebKeyValueWritable.class, NullWritable.class,
    SequenceFile.CompressionType.BLOCK, new DefaultCodec());
final GraphManager graph = new GraphManager();
readLinks(inputSegments, new ReadLinksProcessor() {
public void run(String fromUrl, String toUrl) throws IOException {
writer.append(new ArquivoWebKeyValueWritable(graph.getId(fromUrl),graph.getId(toUrl)),NullWritable.get());
}
});
writer.close();
return graph.numNodes();
}
/**
* Get host name
* @param url url
* @return the lower-cased host name, or null if the URL is malformed
*/
private String getHost(String url) {
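// strip the "c=<collection>,u=" prefix to recover the bare URL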
url=url.substring(url.indexOf(",u=")+3);
try {
return new URL(url).getHost().toLowerCase();
}
catch (MalformedURLException e) {
return null;
}
}
/**
* Write a sequence file mapping each URL to its PageRank score
* @param inputSegments segment directories
* @param outputFile destination file
* @param scores PageRank score per graph node id
* @throws IOException
*/
public void writeFileScores(Path inputSegments[], Path outputFile, final double scores[]) throws IOException {
final SequenceFile.Writer writer = SequenceFile.createWriter(
    fs, conf, outputFile, Text.class, FloatWritable.class,
    SequenceFile.CompressionType.BLOCK, new DefaultCodec());
final GraphManager graph = new GraphManager();
readLinks(inputSegments, new ReadLinksProcessor() {
public void run(String fromUrl, String toUrl) throws IOException {
// Replay the URLs in the same order used to build the graph, so the ids
// assigned by this fresh GraphManager match the indexes into scores[].
int id;
if (!graph.hasId(fromUrl)) { // first sighting of this URL: emit its score once
id=graph.getId(fromUrl);
writer.append(new Text(fromUrl), new FloatWritable((float)scores[id]));
}
if (!graph.hasId(toUrl)) {
id=graph.getId(toUrl);
writer.append(new Text(toUrl), new FloatWritable((float)scores[id]));
}
}
});
writer.close();
}
/**
* Write a plain-text file with the PageRank scores, for debugging
*/
public void writeFileScores2debug(Path inputSegments[], Path outputFile, final double scores[]) throws IOException {
FSDataOutputStream out = fs.create(outputFile);
final PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out)));
final GraphManager graph = new GraphManager();
readLinks(inputSegments, new ReadLinksProcessor() {
public void run(String fromUrl, String toUrl) throws IOException { // same replay as in writeFileScores: ids must line up with scores[]
int id;
if (!graph.hasId(fromUrl)) {
id=graph.getId(fromUrl);
writer.println(fromUrl+" "+scores[id]);
}
if (!graph.hasId(toUrl)) {
id=graph.getId(toUrl);
writer.println(toUrl+" "+scores[id]);
}
}
});
writer.close();
}
/**
* Run the whole PageRank pipeline and write the scores
* @param inputSegments input segments
* @param outputPath output path
* @throws IOException
*/
public void process(Path inputSegments[], Path outputPath) throws IOException {
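// Pipeline: build the integer arc list -> MapReduce sort/dedupe -> load the
// sorted arcs as an ASCII graph -> compress to a BVGraph -> compute PageRank
// -> write the per-URL scores.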
LOG.info("Starting ");
Path graphUnorderedPath=new Path(outputPath+"-unorderedgraph");
int numNodes=buildGraph(inputSegments,graphUnorderedPath);
LOG.info("Graph created in file "+GRAPH_FILE+" with "+numNodes+" nodes.");
// sort keys,values and remove duplicates
Path graphOrderedPath=new Path(outputPath+"-orderedgraph");
JobConf job = createSortKeysJob(conf, graphOrderedPath);
job.addInputPath(graphUnorderedPath);
JobClient.runJob(job);
LOG.info("Graph keys sorted.");
// delete graphUnorderedPath since it is not necessary anymore
fs.delete(graphUnorderedPath);
FSDataInputStream in = fs.open(new Path(graphOrderedPath,"part-00000"));
ArcListASCIIGraphExt graphAscii = ArcListASCIIGraphExt.loadOnce(in);
graphAscii.setNumNodes(numNodes);
LOG.info("Text graph loaded to memory");
// TODO/BUG: the WebGraph API cannot store to an OutputStream, so the BVGraph
// files are written to the local file system rather than under outputPath.
ImmutableGraph.store(BVGraph.class, graphAscii, GRAPH_BV_FILE);
LOG.info("Text graph stored compressed in file "+GRAPH_BV_FILE);
graphAscii=null; // let the text graph be garbage-collected
// delete graphOrderedPath since it is not necessary anymore
fs.delete(graphOrderedPath);
BVGraph graphBv=BVGraph.load(GRAPH_BV_FILE); // loaded from the local file system
LOG.info("Compressed graph loaded to memory");
double scores[]=Pagerank.compute(graphBv);
LOG.info("Pagerank computed");
writeFileScores(inputSegments, new Path(outputPath,SCORES_FILE_NAME), scores);
LOG.info("Scores written to file "+SCORES_FILE_NAME);
// for debug
if (DEBUG) {
writeFileScores2debug(inputSegments, new Path(outputPath,SCORES_FILE_NAME+".debug"), scores);
LOG.info("Scores to debugging written to file "+SCORES_FILE_NAME+".debug");
}
// delete the local BVGraph files
for (String ext : new String[]{".graph", ".offsets", ".properties"}) {
new File(GRAPH_BV_FILE+ext).delete();
}
}
/**
* Main
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception
{
if (args.length != 1) {
System.out.println("Usage : NutchwaxPagerank <outputs>");
System.exit(1);
}
Nutchwax nutchwax=new Nutchwax();
Nutchwax.OutputDirectories od=nutchwax.new OutputDirectories(new Path(args[0]));
NutchwaxPagerank pgs=new NutchwaxPagerank();
pgs.process(pgs.getFs().listPaths(od.getSegments()), od.getPagerank());
}
/**
* Filter link based on URL pattern
* @param url
* @return true if the URL matches one of the exclusion patterns
*/
private boolean filter(String url) {
// test patterns
for (int i=0;i<excludePatterns.length;i++) {
if (excludePatterns[i].matcher(url).matches()) { // Pattern.matcher() never returns null
return true;
}
}
return false;
}
/**
* Processor to pass to readLinks method
*/
public interface ReadLinksProcessor {
public void run(String fromUrl, String toUrl) throws IOException;
}
/**
* Reducer that sorts the integer arcs and removes duplicated arcs.
* Duplicates of a (from, to) pair arrive as one key group with empty
* (NullWritable) values and are emitted exactly once.
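* <p>For example (hypothetical ids): three copies of the arc (3, 7) form a
* single key group and yield one output record "3 7".</p>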
*/
public static class SortKeys extends MapReduceBase implements Reducer {
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
throws IOException {
ArquivoWebKeyValueWritable keyvalue=(ArquivoWebKeyValueWritable)key;
// All values in a group share the same key, so the arc is collected on the
// first iteration and the remaining duplicates are skipped.
int prev=-1;
while (values.hasNext()) { // equal keys with empty values
if (prev==-1 || prev!=keyvalue.getValue()) { // remove duplicated arcs
output.collect(new IntWritable(keyvalue.getKey()), new IntWritable(keyvalue.getValue()));
}
prev=keyvalue.getValue();
values.next();
}
}
}
/**
* Create the MapReduce job that sorts the graph arcs and removes duplicates
* @param config
* @param outputPath
* @return the configured job
*/
private JobConf createSortKeysJob(Configuration config, Path outputPath) {
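// No mapper class is set, so the identity mapper forwards each
// (ArquivoWebKeyValueWritable, NullWritable) pair; the shuffle sorts the arcs
// and the single-reducer partitioner yields one "part-00000" file for process().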
JobConf job = new NutchJob(config);
job.setJobName("sort key,values " + outputPath);
job.setInputFormat(SequenceFileInputFormat.class);
job.setOutputFormat(TextOutputFormat.class);
job.setReducerClass(SortKeys.class);
job.setOutputPath(outputPath);
job.setMapOutputKeyClass(ArquivoWebKeyValueWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setPartitionerClass(NutchwaxPagerankPartitioner.class); // to partition for one reducer only
//job.setOutputValueGroupingComparator(IntWritable.Comparator.class); // to sort values - NOT SUPPORTED IN THIS HADOOP VERSION
return job;
}
}