Package org.apache.ctakes.core.cr

Source Code of org.apache.ctakes.core.cr.LinesFromFileCollectionReader

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.core.cr;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
//import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import org.apache.ctakes.typesystem.type.structured.DocumentID;

/**
*
* The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
* and modified for Mayo use.
*
* This collection reader facilitates reading "documents" from a single file.  Each
* line in the file will be considered an entity to be analyzed by the CPE.  That
* is, each line will be treated as a "document" and will have its own CAS.
*
* Extremely large files will require large memory resources as each line is read into
* memory upon initialization.  This was done to simplify implementation. 
*
* @author Philip V. Ogren
*
*/

public class LinesFromFileCollectionReader extends CollectionReader_ImplBase {

  /**
   * This parameter will be used the descriptor file to specify the location of the
   * file that will be run through this collection reader.
   */
  public static final String PARAM_INPUT_FILE_NAME = "InputFileName";
  /**
   * Optional parameter specifies a comment string.  Any line that begins with the string
   * will be ignored and not be added as a "document" to the CPE.
   */
  public static final String PARAM_COMMENT_STRING = "CommentString";
  /**
   * Optional parameter determines whether a blank line will be processed as a document or
   * will be ignored.  The default will be set to 'true'. 
   */
  public static final String PARAM_IGNORE_BLANK_LINES = "IgnoreBlankLines";
 
  /**
   * Name of optional configuration parameter that contains the language of
   * the documents in the input directory.  If specified this information will
   * be added to the CAS.
   */
    public static final String PARAM_LANGUAGE = "Language";
   
    /**
     * Name of optional configuration parameter that specifies a character (or string) that delimits
     * the id of the document from the text of the document.  For example, if the parameter is
     * set to '|' then the following line from a file:
     * <code>1234|this is some text</code>
     * would have an id of 1234 and text <code>this is some text</code>
     * If this parameter is not set, then
     * the id of a document will be its line number in the file.     
     */

    public static final String PARAM_ID_DELIMETER = "IdDelimeter";
   
  List<String> iv_linesFromFile;
  int iv_currentIndex = 0;
  String iv_language;
  String iv_delimeter;
 
  private Logger iv_logger = Logger.getLogger(getClass().getName());

  public void initialize() throws ResourceInitializationException
  {
    BufferedReader fileReader = null;
    try
    {
      String fileLocation = (String) getConfigParameterValue(PARAM_INPUT_FILE_NAME);
      String commentSeq = (String)getConfigParameterValue(PARAM_COMMENT_STRING);
      iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
      Boolean paramValue = (Boolean)getConfigParameterValue(PARAM_IGNORE_BLANK_LINES);
      boolean ignoreBlankLines = true;
      if(paramValue != null)
      {
        ignoreBlankLines = paramValue.booleanValue();
      }
      iv_delimeter =  (String)getConfigParameterValue(PARAM_ID_DELIMETER);
       
      iv_linesFromFile = new ArrayList<String>();
      fileReader = new BufferedReader(new FileReader(fileLocation));
      String line;
      while((line = fileReader.readLine()) != null)
      {
        if(commentSeq != null)
        {
          if(line.startsWith(commentSeq)) continue;
        }
        if(ignoreBlankLines && line.trim().length() == 0) continue;
        iv_linesFromFile.add(line);
      }
    }
    catch(IOException fnfe)
    {
      throw new ResourceInitializationException(fnfe);
    }
    finally
    {
      if(fileReader != null)
      try { fileReader.close(); } catch(IOException ioe) {}
    }
  }
 
  public void getNext(CAS cas) throws IOException, CollectionException
  {
      JCas jcas;
      try
      {
        jcas = cas.getJCas();
     
        String line = (String) iv_linesFromFile.get(iv_currentIndex);
        int lineNumber = iv_currentIndex + 1;
        String id;
        String text;
        if(iv_delimeter != null)
      {
        int delimeterLoc = line.indexOf(iv_delimeter);
        if(delimeterLoc <= 0)
          throw new CollectionException(new Exception("Line in file number "+lineNumber+" is not well formatted.  " +
              "\nIt should have the format:" +
              "\n<doc_id>"+iv_delimeter+"<doc_text>"));
        id = line.substring(0,delimeterLoc);
        text = line.substring(delimeterLoc+iv_delimeter.length());
      }
        else
        {
          id = Integer.toString(lineNumber); //id will one more than its index into iv_linesFromFile (iv_currentIndex has already been incremented)
          text = line;
        }
       

        iv_logger.debug("id="+id);
        iv_logger.debug("text="+text);
       
      //if there's a CAS Initializer, call it 
      if (getCasInitializer() != null)
      {
        Reader reader = new StringReader(text);
        getCasInitializer().initializeCas(reader, cas);
        reader.close();
      }
      else  //No CAS Initiliazer, so read file and set document text ourselves
      {       
        jcas.setDocumentText(text);
      }
      
        //set language if it was explicitly specified as a configuration parameter
        if (iv_language != null)
        {
//          ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
        }
       
       
        DocumentID documentIDAnnotation = new DocumentID(jcas);
        documentIDAnnotation.setDocumentID(id);
        documentIDAnnotation.addToIndexes();

      }
      catch (CASException e)
      {
        throw new CollectionException(e);
      }
      finally
      {
        iv_currentIndex++;
      }
     
  }

  public boolean hasNext() throws IOException, CollectionException
  {
    return iv_currentIndex < iv_linesFromFile.size();
  }

  public Progress[] getProgress() {
      return new Progress[]{
               new ProgressImpl(iv_currentIndex, iv_linesFromFile.size(),Progress.ENTITIES)};
  }

   /**
     * Gets the total number of documents that will be returned by this
     * collection reader. 
     * @return the number of documents in the collection
     */
    public int getNumberOfDocuments()
    {
      return iv_linesFromFile.size();
    }
   
  public void close() throws IOException {}
}
TOP

Related Classes of org.apache.ctakes.core.cr.LinesFromFileCollectionReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.