Examples of TRECDocumentCollection


Examples of it.unimi.dsi.mg4j.document.TRECDocumentCollection

    OutputStream outputStream = new FileOutputStream(temp);
    IOUtils.copy(this.getClass().getResourceAsStream("testChar255.data"),
        outputStream);
    outputStream.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
        d.title();
    } catch (IllegalStateException e) {
      assertTrue(false);
View Full Code Here

Examples of it.unimi.dsi.mg4j.document.TRECDocumentCollection

    IOUtils.copy( this.getClass().getResourceAsStream( "testContents.data" ), outputStream );
    outputStream.close();
    IOUtils.copy( this.getClass().getResourceAsStream( "testContentsAgain.data" ), outputStreamAgain );
    outputStreamAgain.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString(), tempAgain.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    DocumentIterator iter = collection.iterator();
    Document d = null;

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    final int textIndex = collection.factory().fieldIndex( "text" );
   
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNull(d);
    iter.close();
   
    d = collection.document( 0 );
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 1 );
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 2 );
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 3 );
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 4 );
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 5 );
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 6 );
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );
View Full Code Here

Examples of net.bpiwowar.mg4j.extensions.trec.TRECDocumentCollection

            case "trec":
                Properties properties = new Properties();
                properties.setProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                final TRECDocumentFactory documentFactory = new TRECDocumentFactory(properties);

                collection = new TRECDocumentCollection(files,
                            documentFactory, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;

            case "warc/0.18":
                collection = new WARCDocumentCollection(files, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.