Package org.archive.crawler.util

Source Code of org.archive.crawler.util.BenchmarkUriUniqFilters

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.crawler.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.modules.CrawlURI;
import org.archive.util.fingerprint.MemLongFPSet;


/**
* BenchmarkUriUniqFilters
*
* @author gojomo
*/
public class BenchmarkUriUniqFilters implements UriUniqFilter.CrawlUriReceiver {
//    private Logger LOGGER =
//        Logger.getLogger(BenchmarkUriUniqFilters.class.getName());
   
    private BufferedWriter out; // optional to dump uniq items
    protected String current; // current line/URI being checked
   
    /**
     * Test the UriUniqFilter implementation (MemUriUniqFilter,
     * BloomUriUniqFilter, or BdbUriUniqFilter) named in first
     * argument against the file of one-per-line URIs named
     * in the second argument.
     *
     * @param args from cmd-line
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        (new BenchmarkUriUniqFilters()).instanceMain(args);
    }
   
    public void instanceMain(String[] args) throws IOException {
        String testClass = args[0];
        String inputFilename = args[1];
        long start = System.currentTimeMillis();
        UriUniqFilter uniq = createUriUniqFilter(testClass);
        long created = System.currentTimeMillis();
        BufferedReader br = new BufferedReader(new FileReader(inputFilename));
        if(args.length>2) {
            String outputFilename = args[2];
            out = new BufferedWriter(new FileWriter(outputFilename));
        }
        int added = 0;
        while((current=br.readLine())!=null) {
            added++;
            uniq.add(current,null);
        }
        uniq.close();
        long finished = System.currentTimeMillis();
        if(out!=null) {
            out.close();
        }
        System.out.println(added+" adds");
        System.out.println(uniq.count()+" retained");
        System.out.println((created-start)+"ms to setup UUF");
        System.out.println((finished-created)+"ms to perform all adds");
    }
   
    private UriUniqFilter createUriUniqFilter(String testClass) throws IOException {
        UriUniqFilter uniq = null;
        if(BdbUriUniqFilter.class.getName().endsWith(testClass)) {;
            // BDB setup
            File tmpDir = File.createTempFile("uuf","benchmark");
            tmpDir.delete();
            tmpDir.mkdir();
            uniq = new BdbUriUniqFilter(tmpDir, 50);
        } else if(BloomUriUniqFilter.class.getName().endsWith(testClass)) {
            // bloom setup
            uniq = new BloomUriUniqFilter();
        } else if(MemUriUniqFilter.class.getName().endsWith(testClass)) {
            // mem hashset
            uniq = new MemUriUniqFilter();
        } else if (FPUriUniqFilter.class.getName().endsWith(testClass)) {
            // mem fp set (open-addressing) setup
            uniq = new FPUriUniqFilter(new MemLongFPSet(21,0.75f));
        }
        uniq.setDestination(this);
        return uniq;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(org.archive.crawler.datamodel.CrawlURI)
     */
    public void receive(CrawlURI item) {
        if(out!=null) {
            try {
                // we assume all tested filters are immediate passthrough so
                // we can use 'current'; a buffering filter would change this
                // assumption
                out.write(current);
                out.write("\n");
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }
}
TOP

Related Classes of org.archive.crawler.util.BenchmarkUriUniqFilters

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.