Package org.archive.crawler.util

Source Code of org.archive.crawler.util.BloomUriUniqFilterTest

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.crawler.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import junit.framework.TestCase;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.BloomFilter64bit;


/**
* Test BloomUriUniqFilter.
* @author gojomo
*/
public class BloomUriUniqFilterTest extends TestCase
implements UriUniqFilter.CrawlUriReceiver {
    private Logger logger =
        Logger.getLogger(BloomUriUniqFilterTest.class.getName());

    private BloomUriUniqFilter filter = null;

    /**
     * Set to true if we visited received.
     */
    private boolean received = false;

    protected void setUp() throws Exception {
        super.setUp();
        this.filter = new BloomUriUniqFilter();
        this.filter.setBloomFilter(new BloomFilter64bit(2000, 24));
        this.filter.afterPropertiesSet();
        this.filter.setDestination(this);
    }

    public void testAdding() throws URIException {
        this.filter.add(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addNow(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addForce(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        // Should only have add 'this' once.
        assertTrue("Count is off", this.filter.count() == 1);
    }

    /**
     * Test inserting.
     * @throws URIException
     * @throws IOException
     * @throws FileNotFoundException
     */
    public void testWriting() throws URIException {
        long start = System.currentTimeMillis();
        ArrayList<UURI> list = new ArrayList<UURI>(1000);
        int count = 0;
        final int MAX_COUNT = 1000;
        for (; count < MAX_COUNT; count++) {
            assertEquals("count off",count,filter.count());
            UURI u = UURIFactory.getInstance("http://www" +
                    count + ".archive.org/" + count + "/index.html");
            assertFalse("already contained "+u.toString(),filter.bloom.contains(u.toString()));
            logger.fine("adding "+u.toString());
            filter.add(u.toString(), new CrawlURI(u));
            assertTrue("not in bloom",filter.bloom.contains(u.toString()));
            if (count > 0 && ((count % 100) == 0)) {
                list.add(u);
            }
        }
        logger.fine("Added " + count + " in " +
                (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        logger.fine("Readded subset " + list.size() + " in " +
                (System.currentTimeMillis() - start));

        assertTrue("Count is off: " + filter.count(),
            filter.count() == MAX_COUNT);
    }

    public void testNote() {
        filter.note(this.getUri());
        assertFalse("Receiver was called", this.received);
    }

// FORGET CURRENTLY UNSUPPORTED IN BloomUriUniqFilter
//    public void testForget() throws URIException {
//        this.filter.forget(this.getUri(),
//                new CrawlURI(UURIFactory.getInstance(this.getUri())));
//        assertTrue("Didn't forget", this.filter.count() == 0);
//    }

    public void receive(CrawlURI item) {
        this.received = true;
    }

    public String getUri() {
        return "http://www.archive.org";
    }
}
TOP

Related Classes of org.archive.crawler.util.BloomUriUniqFilterTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.