Package org.apache.uima.ruta.textruler.core

Source Code of org.apache.uima.ruta.textruler.core.TextRulerExampleDocumentSet

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.core;

import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;

/**
*
* TextRulerExampleDocumentSet encapsulates an input set of documents, e.g. examples for a learning
* algorithm. It creates an instance of TextRulerExampleDocument for each XMI file found in the
* passed input folder.
*
* For loading CASes you have to provide a CasCache. If you use TextRulerBasicLearner, this is done
* for you automatically.
*
*         hint: this could be renamed to MLDocumentSet instead of TextRulerExampleDocumentSet ?
*/
public class TextRulerExampleDocumentSet {

  protected List<TextRulerExampleDocument> documents;

  protected CasCache casCache;

  public TextRulerExampleDocumentSet(String xmiFolderName, CasCache casCache) {
    super();
    documents = new ArrayList<TextRulerExampleDocument>();
    this.casCache = casCache;
    File trainingFolder = new File(xmiFolderName);
    File[] files = trainingFolder.listFiles(new FilenameFilter() {
      public boolean accept(File dir, String name) {
        return (name.endsWith(".xmi"));
      }
    });

    for (File file : files) {
      TextRulerToolkit.log("found document XMI file: " + file.getName());
      documents.add(new TextRulerExampleDocument(file.getAbsolutePath(), casCache));
    }
  }

  // for subset creations:
  protected TextRulerExampleDocumentSet(String[] inputXmiFiles, CasCache casCache) {
    super();
    this.casCache = casCache;
    documents = new ArrayList<TextRulerExampleDocument>();
    for (String fileName : inputXmiFiles)
      documents.add(new TextRulerExampleDocument(fileName, casCache));
  }

  public void createExamplesForTarget(TextRulerTarget target) {
    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder();
    for (TextRulerExampleDocument doc : sortedDocs) {
      doc.createExamplesForTarget(target);
    }
  }

  public void clearCurrentExamples() {
    for (TextRulerExampleDocument doc : documents)
      doc.clearCurrentExamples();
  }

  public Collection<CAS> getCachedCASes() {
    return casCache.getCachedCASes();
  }

  public boolean casCacheContainsKey(String key) {
    return casCache.containsElementWithKey(key);
  }

  public List<TextRulerExample> getAllExamples() {
    return getAllExamples(false);
  }

  public List<TextRulerExample> getAllPositiveExamples() {
    return getAllExamples(true);
  }

  public List<TextRulerExample> getAllExamples(boolean onlyPositives) {
    List<TextRulerExample> result = new ArrayList<TextRulerExample>();
    for (TextRulerExampleDocument doc : documents) {
      result.addAll(doc.getPositiveExamples());
      if (!onlyPositives)
        result.addAll(doc.getNegativeExamples());
    }
    return result;
  }

  public List<TextRulerExampleDocument> getDocuments() {
    return documents;
  }

  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder(
          Collection<TextRulerExampleDocument> documents) {
    Set<TextRulerExampleDocument> docsLeft = new HashSet<TextRulerExampleDocument>(documents);
    TextRulerExampleDocument[] sortedDocs = new TextRulerExampleDocument[documents.size()];

    // "sort" the currently cached documents to the front of the document
    // list, so that
    // we can use them directly and do not have to reload all docs everytime
    // we come here!
    int i = 0;
    for (TextRulerExampleDocument doc : documents) {
      if (casCacheContainsKey(doc.getCasFileName())) {
        docsLeft.remove(doc);
        sortedDocs[i] = doc;
        i++;
      }
    }
    for (TextRulerExampleDocument doc : docsLeft) {
      sortedDocs[i] = doc;
      i++;
    }
    if (TextRulerToolkit.DEBUG) {
      TextRulerToolkit.logIf(i != documents.size(), "ERROR, SIZE MISMATCH!");
    }

    return sortedDocs;
  }

  public TextRulerExampleDocument[] getSortedDocumentsInCacheOptimizedOrder() {
    return getSortedDocumentsInCacheOptimizedOrder(documents);
  }

  public List<Integer> getTokenCountHistogrammForSlotName(String slotName, Set<String> filterSet) {
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    int maxLen = 0;

    TextRulerExampleDocument[] sortedDocs = getSortedDocumentsInCacheOptimizedOrder(documents);

    for (TextRulerExampleDocument doc : sortedDocs) {
      CAS aCas = doc.getCAS();
      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
      TypeSystem ts = aCas.getTypeSystem();
      for (AnnotationFS a : slots) {

        List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas, a
                .getBegin(), a.getEnd(), TextRulerToolkit.getFilterSetWithSlotName(slotName,
                filterSet), ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME));
        int len = slotTokens.size();
        if (len > maxLen)
          maxLen = len;
        Integer key = new Integer(len);
        int current = map.containsKey(key) ? map.get(key) : 0;
        map.put(key, len + current);
      }
    }
    List<Integer> resultList = new ArrayList<Integer>(maxLen + 1);
    for (int i = 0; i <= maxLen; i++) {
      int value = map.containsKey(i) ? map.get(i) : 0;
      resultList.add(value);
    }
    return resultList;
  }

  public CAS getCAS(String key) {
    return casCache.getCAS(key);
  }

  public int size() {
    return documents.size();
  }

  public TextRulerExampleDocument getDocumentForFileName(String fileName) {
    for (TextRulerExampleDocument doc : documents)
      if (doc.getCasFileName().equals(fileName))
        return doc;
    return null;
  }

  // TODO this is not tested yet!
  public List<TextRulerExampleDocumentSet> partitionIntoSubsets(int[] percentages) {
    List<TextRulerExampleDocumentSet> result = new ArrayList<TextRulerExampleDocumentSet>();

    int sum = 0;
    for (int p : percentages) {
      if (p == 0) {
        TextRulerToolkit
                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero!");
        return null;
      }
      sum += p;
    }
    if (sum != 100) {
      TextRulerToolkit
              .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] percentages has to be 100 in total!");
      return null;
    }

    int rest = size();
    int docIndex = 0;

    for (int i = 0; i < percentages.length; i++) {
      int partSize;
      if (i == percentages.length - 1) {
        partSize = Math.round((((percentages[i] * size()) / 100.0f)));
        if (partSize == 0)
          partSize = 1;
      } else
        partSize = rest;

      if (partSize == 0) {
        TextRulerToolkit
                .log("[TextRulerExampleDocumentSet.partitionIntoSubsets] a percentage must not be zero! too few example documents for your partition?");
        return null;
      }
      String[] fileNames = new String[partSize];
      for (int doc = 0; doc < partSize; doc++)
        fileNames[doc] = documents.get(doc + docIndex).getCasFileName();
      docIndex += partSize;
      result.add(new TextRulerExampleDocumentSet(fileNames, casCache));
      rest -= partSize;
    }
    return result;
  }

}
TOP

Related Classes of org.apache.uima.ruta.textruler.core.TextRulerExampleDocumentSet

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.