Package it.unimi.di.big.mg4j.document

Examples of it.unimi.di.big.mg4j.document.DocumentCollection


            list.add(s);
        String[] files = list.toArray(new String[list.size()]);
        Arrays.sort(files);


        final DocumentCollection collection;
        String docType = conf.format;
        File metadataFile = new File(output.getAbsolutePath() + ".metadata");

        switch (docType) {
            case "trec":
                Properties properties = new Properties();
                properties.setProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                final TRECDocumentFactory documentFactory = new TRECDocumentFactory(properties);

                collection = new TRECDocumentCollection(files,
                            documentFactory, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;

            case "warc/0.18":
                collection = new WARCDocumentCollection(files, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;
            default:
                LOGGER.error(String.format("Unknown document type [%s]", docType));
                System.exit(-1);
                throw new AssertionError();
        }

        // Store the collection
        BinIO.storeObject(collection, out);

        LOGGER.info("Found {} documents in the collection", collection.size());
        return 0;
    }
View Full Code Here


    }

  @Override
  public int execute() throws Throwable {
        final DocumentCollection collection = collectionCf.init();
        index.init();

        // Read model & topics
        logger.info("Reading model");
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setNamespaceAware(true);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        org.w3c.dom.Document xml = taskFile == null  ? dBuilder.parse(System.in) : dBuilder.parse(new FileInputStream(taskFile));

        JAXBContext context = JAXBContext.newInstance(BM25.class);
        Unmarshaller um = context.createUnmarshaller();

        RetrievalModel model = null;
        QuerySet querySet = null;

        for(Element child: XMLUtils.elements(xml.getDocumentElement().getChildNodes())) {
            if (XMLUtils.is(child, ADHOC_MODEL)) {
                for(Element grandchild: XMLUtils.elements(child.getChildNodes())) {
                    model = (RetrievalModel) um.unmarshal(grandchild);
                    break;
                }
            }
            if (XMLUtils.is(child, TOPICS)) {
                final String type = child.getAttribute("type");
                switch(type) {
                    case "trec":
                        try(BufferedReader reader = new BufferedReader(new FileReader(child.getAttribute("path")))) {
                            querySet = TRECTopic.readTopics(reader, false);
                        }
                        break;
                    default:
                        throw new RuntimeException(String.format("Cannot handle topics of type %s", type));
                }
                // Do something
            }
        }

        if (model == null)
            throw new IllegalArgumentException("No model was present in the XML description file");
        if (querySet == null)
            throw new IllegalArgumentException("No topics were present in the XML description file");

        logger.info(String.format("Starting with model [%s] and %d topics", model, querySet.queries().size()));

        // Discarded documents
    TRECJudgments discarded = discardedQRELFile == null ? null
        : new TRECJudgments(discardedQRELFile);



        // Queries
    Set<String> topicIds = GenericHelper.newHashSet();
    Map<String, ? extends Topic> topics = querySet.queries();
    for (String id : topics.keySet()) {
      logger.debug(new LazyString("Considering topic %s (%b/%b/%b)", id, topics.keySet()
          .contains(id), onlyTopics.isEmpty(), onlyTopics
          .contains(id)));
      if (topics.keySet().contains(id)
          && (onlyTopics.isEmpty() || onlyTopics.contains(id))) {
        topicIds.add(id);
      }
    }

    if (topicIds.isEmpty()) {
      logger.error("No topics to be answered");
      return 1;
    }

    // Iterates on topics
    timer.start();
    TaskTimer.Task task = timer.new Task("Answering topics", "topics",
        topicIds.size());
    PrintStream output = System.out;

    model.init(collection, index);
    int totalRetrieved = 0;
    for (String topicId : topicIds) {
      logger.info(String.format("Answering topic %s", topicId));

      Topic topic = topics.get(topicId);
      ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> results = new ObjectArrayList<>();

      Set<String> discardedDocuments = null;
      if (discarded != null) {
        Map<String, Integer> map = discarded.get(topicId);
        if (map != null)
          discardedDocuments = map.keySet();

      }

      // Ask for results (add some documents in case we discard some
      // after)
      model.process(topic, results,
          capacity
              + (discardedDocuments == null ? 0
                  : discardedDocuments.size()), timer);

      final int retrieved = results.size();
      totalRetrieved += retrieved;
      logger.info(String.format("Returned %d results", retrieved));
      int added = 0;
      for (int i = 0; i < retrieved && added < capacity; i++) {
        DocumentScoreInfo dsi = results.get(i);
        Document document = collection.document(dsi.document);
                System.err.println("URI: " + document.uri());
                System.err.println("URI["+dsi.document+"]: " + collection.metadata(dsi.document).get(PropertyBasedDocumentFactory.MetadataKeys.URI));
        final String docno = (String) collection.metadata(dsi.document).get(PropertyBasedDocumentFactory.MetadataKeys.URI);

        // If it was not a discarded document
        if (discardedDocuments == null
            || !discardedDocuments.contains(docno)) {
          output.format("%s Q0 %s %d %g %s%n", topicId, docno, i,
View Full Code Here

TOP

Related Classes of it.unimi.di.big.mg4j.document.DocumentCollection

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.