Package it.unimi.dsi.mg4j.document

Source Code of it.unimi.dsi.mg4j.document.TRECDocumentCollectionTest

package it.unimi.dsi.mg4j.document;

import it.unimi.dsi.mg4j.document.CompositeDocumentFactory;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.document.DocumentFactory;
import it.unimi.dsi.mg4j.document.DocumentIterator;
import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;
import it.unimi.dsi.mg4j.document.TRECDocumentCollection;
import it.unimi.dsi.mg4j.document.TRECHeaderDocumentFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;

import junit.framework.AssertionFailedError;
import junit.framework.TestCase;

import org.apache.commons.io.IOUtils;

public class TRECDocumentCollectionTest extends TestCase {

  public void testChar255() throws Exception {
    File temp = File.createTempFile(TRECDocumentCollectionTest.class
        .getName(), ".testChar255");

    temp.deleteOnExit();
    OutputStream outputStream = new FileOutputStream(temp);
    IOUtils.copy(this.getClass().getResourceAsStream("testChar255.data"),
        outputStream);
    outputStream.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
        d.title();
    } catch (IllegalStateException e) {
      assertTrue(false);
    }

  }

  public void testContents() throws Exception {
    File temp = File.createTempFile( TRECDocumentCollectionTest.class.getName(), ".testContents" );
    File tempAgain = File.createTempFile( TRECDocumentCollectionTest.class.getName(), ".testContentsAgain" );

    temp.deleteOnExit();
    tempAgain.deleteOnExit();
    OutputStream outputStream = new FileOutputStream( temp );
    OutputStream outputStreamAgain = new FileOutputStream( tempAgain );
    IOUtils.copy( this.getClass().getResourceAsStream( "testContents.data" ), outputStream );
    outputStream.close();
    IOUtils.copy( this.getClass().getResourceAsStream( "testContentsAgain.data" ), outputStreamAgain );
    outputStreamAgain.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString(), tempAgain.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    DocumentIterator iter = collection.iterator();
    Document d = null;

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    final int textIndex = collection.factory().fieldIndex( "text" );
   
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNull(d);
    iter.close();
   
    d = collection.document( 0 );
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 1 );
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 2 );
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 3 );
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 4 );
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 5 );
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 6 );
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

  }
}
TOP

Related Classes of it.unimi.dsi.mg4j.document.TRECDocumentCollectionTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.