// Source code of org.apache.lucene.search.caches.PwaBlacklistCache$ProcessThread

package org.apache.lucene.search.caches;


import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.Base32;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.ArrayList;




/**
 * Identify and cache pages that should be discarded
 * @author Miguel Costa
 * 
 * @note this class can receive a list of errors and redirects from crawler or test all documents
 */
public class PwaBlacklistCache implements PwaICache {


  private final static String CACHE_FILENAME="blacklist.cache";
  //private final static String ERRORS_FILENAME="errors.urls";
  private final static int PAGE_CODE_NOT_FOUND=404;
  private final static int INDEX_ID=0;
  private final static int INC_LOG=100000;
  //private final static long MAX_TIMEOUT_THREAD=300000; // millisec until timeout of thread
  private static Object lockObj=new Object();  
  private static BitSet docBlackList=null; // blacklist of documents  
  private IndexReader reader;
  private Searcher searcher;  
  private ArrayList<Integer>[] idsThread; // array of vectors, with each vector containing the ids for each thread
    
  
  
  /**
   * Constructor   
   * @param reader index reader   
   * @param searcher index searcher
   * @param blacklistDir blacklist directory 
   * @throws IOException
   */
  public PwaBlacklistCache(IndexReader reader, Searcher searcher, File blacklistFile) throws IOException {
    this(reader, blacklistFile);
    this.searcher=searcher;
  }
  
  /**
   * Constructor
   * @param reader index reader
   * @param blacklistDir blacklist directory 
   * @throws IOException
   */
  public PwaBlacklistCache(IndexReader reader, File blacklistFile) throws IOException {    
    if (docBlackList!=null) {
      return;
    }


    // load cache once    
    synchronized(lockObj) {
      if (docBlackList!=null) {
        return;
      }
      this.reader=reader;      
      
      System.out.println("Loading blacklist to RAM at "+this.getClass().getSimpleName()+" class. The file is at "+blacklistFile.getAbsolutePath());      
      docBlackList=new BitSet(reader.maxDoc());  
                        
      if (blacklistFile==null) {
        String fileDir=reader.directory().toString().substring(reader.directory().toString().indexOf('@')+1);
        blacklistFile=new File(fileDir,CACHE_FILENAME);
      }  
      BufferedReader br = new BufferedReader(new FileReader(blacklistFile));          
      String line;
      int nfields=1;
      
      while ( ( line = br.readLine() ) != null ) {        
        String parts[] = line.split( "\\s" );      
        
        if (parts.length!=nfields) { 
          throw new IOException("ERROR: wrong number of fields.");
        }
              
        int doc=Integer.parseInt(parts[0]);
        docBlackList.set(doc,true);                              
      }      
      br.close();
      
      System.out.println("Loading blacklist to RAM at "+this.getClass().getSimpleName()+" class ended.");
    }      
  }
      
  /**
   * Get field name
   * @return field name
   */
  public String getFieldName() {
    return "blacklist";
  }
  
  /**
   * Get value from cache
   * @param ocument identifier
   * @return value from cache
   */
  public Object getValue(int doc) {
    return docBlackList.get(doc);
  }    
  
  /**
   * Indicates if the document is valid
   * @param doc document identifier
   * @return true if the document is valid, false otherwise
   */
  public boolean isValid(int doc) {
    return !docBlackList.get(doc);
  }
  
  
    
  /**   
   * Write a file with the ids of documents not archived
   * @param reader index reader
   * @param urlBase url to concat
   * @param nThreads number of threads
   * @param firstDoc first document
   * @param lastDoc last document
   * @param errorsFile errors filename with HTTP error and URL, or null
   * @throws IOException
   */
  public void writeCache(String urlBase, int nThreads, int firstDoc, int lastDoc, String errorsFile) throws IOException {          


    // identify the urls with problems
    identifyIdsFromUrls(nThreads,errorsFile);
    
    if (lastDoc==-1) {
      lastDoc=reader.maxDoc();
    }
    else {            
      if (lastDoc>reader.maxDoc()) {
        lastDoc=reader.maxDoc();
      }
    }
    docBlackList=new BitSet(reader.maxDoc()); // documents to eliminate    
    
    ProcessThread thr[]=new ProcessThread[nThreads];
    for (int i=0;i<nThreads;i++) { // for all threads process data          
      thr[i] = null;
      thr[i] = new ProcessThread(i,firstDoc,lastDoc,urlBase);
      thr[i].setPriority(Thread.MAX_PRIORITY);
      thr[i].start();
    }
    
    for (int i=0;i<nThreads;i++) { // for all threads process data          
      if (thr[i]!=null) {
        try {
          //thr[i].join(MAX_TIMEOUT_THREAD);
          thr[i].join();
          //thr[i].stop();          
        }
        catch (InterruptedException e) { 
          throw new IOException("ERROR: interrupt for thread "+(i+1)+".");
        }
        thr[i]=null;
      }
    }
    
      
    // save list to file
    writeList(firstDoc, lastDoc);
  }


  /**
   * Save blacklist to file
   * @param firstDoc first document to save
   * @param lasttDoc last document to save
   */
  private void writeList(int firstDoc, int lastDoc) throws IOException {
    String fileDir=reader.directory().toString().substring(reader.directory().toString().indexOf('@')+1);    
    PrintWriter pw=new PrintWriter(new File(fileDir,CACHE_FILENAME));
    for (int i=firstDoc;i<lastDoc;i++) {
      if (docBlackList.get(i)) {
        pw.println(""+i);
      }
    }
    pw.flush();
    pw.close();  
  }


  /**
   * Identify ids of documents with errors given a list of urls   
   * @param nThreads number of threads
   * @param errorsFile errors filename with HTTP error and URL, or null
   */
  private void identifyIdsFromUrls(int nThreads, String errorsFile) throws IOException {      
    idsThread=new ArrayList[nThreads];    
    for (int i=0;i<nThreads;i++) {
      idsThread[i]=new ArrayList<Integer>();
    }
                
    if (errorsFile!=null) { // set id for each url in error file
      BufferedReader br = new BufferedReader(new FileReader(new File(errorsFile)));
      String line;
      int nfields=2;    
      MessageDigest md = null;  
      try {
        md = MessageDigest.getInstance("MD5");
      }
      catch (NoSuchAlgorithmException e) {
        throw new IOException("Failed to get md5 digester: " + e.getMessage());
      }           
    
      while ( ( line = br.readLine() ) != null ) {        
        String parts[] = line.split( "\\s" );      
      
        if (parts.length!=nfields) { 
          throw new IOException("ERROR: wrong number of fields.");
        }
            
        int errorCode=Integer.parseInt(parts[0]);
        String url=parts[1];                      
        String encoded = Base32.encode(md.digest(url.getBytes()));
        Hits hits=searcher.search(new TermQuery(new Term("exacturl", encoded))); // index query          
        if (hits.length()>0) {
          idsThread[hits.id(0)%nThreads].add(hits.id(0));
        }
      
        System.out.println(errorCode+" "+hits.length()+" "+url+" "+(hits.length()>0 ? hits.id(0) : "")); // TODO remove
      }      
      br.close();
    }
    else { // set all ids
      int maxDoc=reader.maxDoc();
      for (int i=0;i<maxDoc;i++) {
        idsThread[i%nThreads].add(i);
      }    
    }
    
    for (int i=0;i<nThreads;i++) {
      System.out.println("thread "+i+"'s list size:"+idsThread[i].size());
    }
  }
  
  /**
   * Open url and get response code
   * @param url
   * @return response code
   * @throws IOException
   */
  public static int openUrlInputStream(URL url) throws IOException {
    int httpResponseCode = -1;
        URLConnection urlConnection = url.openConnection();        
        if(urlConnection instanceof HttpURLConnection) {
            HttpURLConnection httpUrlConnection = (HttpURLConnection)urlConnection;
            httpUrlConnection.setInstanceFollowRedirects(true);
            httpResponseCode = httpUrlConnection.getResponseCode();             
        }
        urlConnection.getInputStream().close();     
        return httpResponseCode;
    }


  
 
  
  /**
   * Main
   * @param args arguments
   */
  public static void main(String[] args) throws Exception {  
            
    String usage="usage: [index path] [url base] [number threads] [startDoc or 0(first)] [lastDoc exclusively or -1(all)] [errorsFile or nothing] \n e.g.: /data/arcs/outputsIAall/index http://t2.tomba.fccn.pt/wayback/wayback 20 0 -1 errors.urls";
        
    if (args.length!=5 && args.length!=6) {
      System.out.println(usage);
      System.exit(0);
    }
    
    Directory idx = FSDirectory.getDirectory(args[0], false);
    org.apache.lucene.index.IndexReader reader=IndexReader.open(idx);    
    org.apache.lucene.search.Searcher searcher = new IndexSearcher(idx);
    PwaBlacklistCache cache=new PwaBlacklistCache(reader,searcher,null);        
    cache.writeCache(args[1],Integer.parseInt(args[2]),Integer.parseInt(args[3]),Integer.parseInt(args[4]), args.length==6 ? args[5] : null);
    searcher.close();
    reader.close();        
  }
  
  
  
  
  /**
   * Process thread
   */
  class ProcessThread extends Thread {    
    private int id;
    private int count;
    private int thread;
    private int lastDoc;
    private String urlBase;
    private URL netUrl;    


    public ProcessThread(int thread, int firstDoc, int lastDoc, String urlBase) {
      this.count=0;
      this.thread=thread;
      this.lastDoc=lastDoc;
      this.urlBase=urlBase;
    }
    
    public void run() {      
      System.out.println("Started thread "+thread);
      
      int size=idsThread[thread].size();
      while (count<size) {              
        if (thread==0 && count%INC_LOG==0) {
          System.out.println("count: "+count);
        }
        
        try {
          id=idsThread[thread].get(count);
          netUrl = new URL(urlBase+"/"+"id"+id+"index"+INDEX_ID);                  
          int responseCode=openUrlInputStream(netUrl);        
          if (responseCode==PAGE_CODE_NOT_FOUND) {        
            docBlackList.set(id,true);
            System.err.println(""+id+" error 404");
          }
          else if (responseCode==-1) {
            System.err.println("-1: "+id);
          }                
        }
        catch (IOException ex) {
          docBlackList.set(id,true);
          System.err.println(""+id+" error:"+ex.getMessage());
        }    
        catch (RuntimeException ex) { // problem in response from the server
          docBlackList.set(id,true);
          System.err.println(""+id+" error:"+ex.getMessage());
        }
                
        count++;
      }
    }
  }
}
// Source code of org.apache.lucene.search.caches.PwaBlacklistCache$ProcessThread
//
// Related classes of org.apache.lucene.search.caches.PwaBlacklistCache$ProcessThread