Package org.apache.access.nutch.utils

Source Code of org.apache.access.nutch.utils.UrlNormalizer

package org.apache.access.nutch.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;


/**
* Normalize URLs using nutchwax normalizers
* @author Miguel Costa
*/
public class UrlNormalizer extends ToolBase {

  private final static String TOKEN=",";
  private static URLNormalizers urlNormalizers; 
 
 
  public UrlNormalizer() {   
  }
 
  public String normalize(String url) throws IOException {
    if (urlNormalizers==null) {
      urlNormalizers=new URLNormalizers(getConf(),URLNormalizers.SCOPE_FETCHER);
    }
    return urlNormalizers.normalize(url,URLNormalizers.SCOPE_FETCHER);
  }
     
  public static void main(String[] args) throws Exception {
    if (args.length!=4) {
        System.out.println("arguments: <filename input> <filename output> <number fields> <index of field to normalize>");
        return;
    }
   
    int res = new UrlNormalizer().doMain(NutchConfiguration.create(),args);
    System.exit(res);
  }
   
  public int run(String[] args) throws Exception {
                   
    int nFields=Integer.parseInt(args[2]);
    int indexField=Integer.parseInt(args[3]);   
   
    System.out.println("Start normalizing "+args[0]+" file.");
       
    BufferedReader br = new BufferedReader(new FileReader(args[0]) );
    PrintWriter pw=new PrintWriter(new File(args[1]));
    String line; 
    String parts[]=null;
    StringBuffer sbuf=null;
    StringBuffer surl=null;

    while ( ( line = br.readLine() ) != null ) {       
      parts = line.split(TOKEN);
   
      /*
      if (parts.length!=nFields) {
        throw new IOException("ERROR: wrong line:"+line);
        System.out.println("Possible wrong line:"+line);
      } 
      */
     
      sbuf=new StringBuffer(500);
      surl=new StringBuffer(500);     
      try {
        for (int i=0;i<parts.length;i++) {
                   
          if (i==0) { // check data
           
          }
         
          if (i>=indexField && i<=parts.length-nFields+indexField) {
            if (i!=0) {
              if (i==indexField) {
                sbuf.append(TOKEN);
              }
              else {
                surl.append(TOKEN);
              }
            }
            surl.append(parts[i]);
            if (i==parts.length-nFields+indexField) {
              sbuf.append("\""+normalize(surl.substring(1,surl.length()-1))+"\"");
            }
          }
          else {
            if (i==0) {             
              // check date
              String sdate=parts[0].substring(1,parts[0].length()-1);           
              SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss")
              try {
                Date date = sdf.parse(sdate); // throw ParseException
              }
              catch (ParseException e) {       
                sdate+="00"; // for errors such as these "2000-11-19 16:28:"
                Date date = sdf.parse(sdate); // throw ParseException             
              }               
              if (sdate.indexOf("-00")!=-1) { // for errors such as these - date/time field value out of range: "2000-00-10 01:23:52"
                throw new ParseException("Wrong date in this version: "+line, 0);
              }
             
              sbuf.append("\""+sdate+"\"");
            }
            else {           
              sbuf.append(TOKEN);
              sbuf.append(parts[i]);
            }           
          }       
        } 

        pw.println(sbuf);
      }
      catch (MalformedURLException e) {
        System.out.println("MalformedURLException for url "+surl.substring(1,surl.length()-1));
      }         
      catch (ParseException e) {       
        System.out.println("Wrong date in this version: "+line);
      }   
    }
    br.close();
    pw.close();
    return 0
  }

}
TOP

Related Classes of org.apache.access.nutch.utils.UrlNormalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.