Package net.bpiwowar.mg4j.extensions.tasks

Source Code of net.bpiwowar.mg4j.extensions.tasks.BuildCollection$DocumentsConfiguration

package net.bpiwowar.mg4j.extensions.tasks;

import bpiwowar.argparser.Argument;
import bpiwowar.argparser.checkers.IOChecker;
import bpiwowar.experiments.AbstractTask;
import bpiwowar.experiments.TaskDescription;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.TypeAdapter;
import com.google.gson.annotations.JsonAdapter;
import com.google.gson.annotations.SerializedName;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
import com.google.gson.stream.JsonWriter;
import it.unimi.di.big.mg4j.document.DocumentCollection;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.util.Properties;
import net.bpiwowar.mg4j.extensions.Compression;
import net.bpiwowar.mg4j.extensions.segmented.SegmentedDocumentCollection;
import net.bpiwowar.mg4j.extensions.trec.TRECDocumentCollection;
import net.bpiwowar.mg4j.extensions.trec.TRECDocumentFactory;
import net.bpiwowar.mg4j.extensions.warc.WARCDocumentCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.namespace.NamespaceContext;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;

/**
* Builds a collection
*
* @author B. Piwowarski <benjamin@bpiwowar.net>
*/
@TaskDescription(name = "build-collection", project = {"mg4j", "extensions"})
public class BuildCollection extends AbstractTask {
    final static private Logger LOGGER = LoggerFactory.getLogger(BuildCollection.class);
    public static final String IRCOLLECTIONS_NS = "http://ircollections.sourceforge.net";

    @Argument(name = "out", help = "The output file", required = true)
    File output;

    @Argument(name = "configuration", checkers = IOChecker.Readable.class, help = "XML configuration file from ir.collections. If not given, uses the standard input.")
    File configuration;


    public static class XPMPathAdapter extends TypeAdapter<String> {
        @Override
        public void write(JsonWriter jsonWriter, String s) throws IOException {

        }

        @Override
        public String read(JsonReader jsonReader) throws IOException {
            final JsonToken token = jsonReader.peek();
            String s = null;
            switch (token) {
                case BEGIN_OBJECT:
                    jsonReader.beginObject();
                    while (jsonReader.hasNext()) {
                        final String key = jsonReader.nextName();
                        if (!key.equals("$$value")) {
                            jsonReader.skipValue();
                        } else {
                            s = jsonReader.nextString();
                        }
                    }
                    jsonReader.endObject();
                    return s;
                case STRING:
                    return jsonReader.nextString();
                default:
                    throw new RuntimeException("cannot read");
            }

        }
    }

    static public class DocumentsConfiguration {
        @SerializedName("$compression")
        String compression;

        @SerializedName("$format")
        String format;

        @SerializedName("path")
        @JsonAdapter(XPMPathAdapter.class)
        public String path;
    }

    @Override
    public int execute() throws Throwable {
        // Create the file now
        OutputStream out = new FileOutputStream(output);

        // Read the IR collection file
        LOGGER.info("Reading the IR collections file");
        final InputStream stream = configuration != null ? new FileInputStream(configuration) : System.in;

        final Gson gson = new GsonBuilder()
                .create();
        final DocumentsConfiguration conf = gson.fromJson(new InputStreamReader(stream), DocumentsConfiguration.class);


        final Compression compression = Compression.fromString(conf.compression);
        LOGGER.info(String.format("Format is %s, compression  is %s", conf.format, compression));

        // Get the files
        LOGGER.info("Reading the file list");
        System.out.format("Document path: %s%n", conf.path);
        ArrayList<String> list = new ArrayList<>();
        final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(conf.path)));
        String s;
        while ((s = reader.readLine()) != null)
            list.add(s);
        String[] files = list.toArray(new String[list.size()]);
        Arrays.sort(files);


        final DocumentCollection collection;
        String docType = conf.format;
        File metadataFile = new File(output.getAbsolutePath() + ".metadata");

        switch (docType) {
            case "trec":
                Properties properties = new Properties();
                properties.setProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                final TRECDocumentFactory documentFactory = new TRECDocumentFactory(properties);

                collection = new TRECDocumentCollection(files,
                            documentFactory, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;

            case "warc/0.18":
                collection = new WARCDocumentCollection(files, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;
            default:
                LOGGER.error(String.format("Unknown document type [%s]", docType));
                System.exit(-1);
                throw new AssertionError();
        }

        // Store the collection
        BinIO.storeObject(collection, out);

        LOGGER.info("Found {} documents in the collection", collection.size());
        return 0;
    }

    private static class IRCNamespaces implements NamespaceContext {
        @Override
        public String getNamespaceURI(String s) {
            if (s.equals("irc"))
                return IRCOLLECTIONS_NS;
            return null;
        }

        @Override
        public String getPrefix(String s) {
            return null;
        }

        @Override
        public Iterator getPrefixes(String s) {
            return null;
        }
    }

}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.tasks.BuildCollection$DocumentsConfiguration

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.