/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.core.cr;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
//import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
/**
*
* The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
* and modified for Mayo use.
*
* This collection reader facilitates reading "documents" from a single file. Each
* line in the file is considered an entity to be analyzed by the CPE. That is,
* each line is treated as a separate "document" and gets its own CAS.
*
* Extremely large files will require large memory resources as each line is read into
* memory upon initialization. This was done to simplify implementation.
*
* @author Philip V. Ogren
*
*/
public class LinesFromFileCollectionReader extends CollectionReader_ImplBase {
/**
* This parameter will be used the descriptor file to specify the location of the
* file that will be run through this collection reader.
*/
public static final String PARAM_INPUT_FILE_NAME = "InputFileName";
/**
* Optional parameter specifies a comment string. Any line that begins with the string
* will be ignored and not be added as a "document" to the CPE.
*/
public static final String PARAM_COMMENT_STRING = "CommentString";
/**
* Optional parameter determines whether a blank line will be processed as a document or
* will be ignored. The default will be set to 'true'.
*/
public static final String PARAM_IGNORE_BLANK_LINES = "IgnoreBlankLines";
/**
* Name of optional configuration parameter that contains the language of
* the documents in the input directory. If specified this information will
* be added to the CAS.
*/
public static final String PARAM_LANGUAGE = "Language";
/**
* Name of optional configuration parameter that specifies a character (or string) that delimits
* the id of the document from the text of the document. For example, if the parameter is
* set to '|' then the following line from a file:
* <code>1234|this is some text</code>
* would have an id of 1234 and text <code>this is some text</code>.
* If this parameter is not set, then
* the id of a document will be its line number in the file.
*/
public static final String PARAM_ID_DELIMETER = "IdDelimeter";
List<String> iv_linesFromFile;
int iv_currentIndex = 0;
String iv_language;
String iv_delimeter;
private Logger iv_logger = Logger.getLogger(getClass().getName());
public void initialize() throws ResourceInitializationException
{
BufferedReader fileReader = null;
try
{
String fileLocation = (String) getConfigParameterValue(PARAM_INPUT_FILE_NAME);
String commentSeq = (String)getConfigParameterValue(PARAM_COMMENT_STRING);
iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
Boolean paramValue = (Boolean)getConfigParameterValue(PARAM_IGNORE_BLANK_LINES);
boolean ignoreBlankLines = true;
if(paramValue != null)
{
ignoreBlankLines = paramValue.booleanValue();
}
iv_delimeter = (String)getConfigParameterValue(PARAM_ID_DELIMETER);
iv_linesFromFile = new ArrayList<String>();
fileReader = new BufferedReader(new FileReader(fileLocation));
String line;
while((line = fileReader.readLine()) != null)
{
if(commentSeq != null)
{
if(line.startsWith(commentSeq)) continue;
}
if(ignoreBlankLines && line.trim().length() == 0) continue;
iv_linesFromFile.add(line);
}
}
catch(IOException fnfe)
{
throw new ResourceInitializationException(fnfe);
}
finally
{
if(fileReader != null)
try { fileReader.close(); } catch(IOException ioe) {}
}
}
public void getNext(CAS cas) throws IOException, CollectionException
{
JCas jcas;
try
{
jcas = cas.getJCas();
String line = (String) iv_linesFromFile.get(iv_currentIndex);
int lineNumber = iv_currentIndex + 1;
String id;
String text;
if(iv_delimeter != null)
{
int delimeterLoc = line.indexOf(iv_delimeter);
if(delimeterLoc <= 0)
throw new CollectionException(new Exception("Line in file number "+lineNumber+" is not well formatted. " +
"\nIt should have the format:" +
"\n<doc_id>"+iv_delimeter+"<doc_text>"));
id = line.substring(0,delimeterLoc);
text = line.substring(delimeterLoc+iv_delimeter.length());
}
else
{
id = Integer.toString(lineNumber); //id will one more than its index into iv_linesFromFile (iv_currentIndex has already been incremented)
text = line;
}
iv_logger.debug("id="+id);
iv_logger.debug("text="+text);
//if there's a CAS Initializer, call it
if (getCasInitializer() != null)
{
Reader reader = new StringReader(text);
getCasInitializer().initializeCas(reader, cas);
reader.close();
}
else //No CAS Initiliazer, so read file and set document text ourselves
{
jcas.setDocumentText(text);
}
//set language if it was explicitly specified as a configuration parameter
if (iv_language != null)
{
// ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
}
DocumentID documentIDAnnotation = new DocumentID(jcas);
documentIDAnnotation.setDocumentID(id);
documentIDAnnotation.addToIndexes();
}
catch (CASException e)
{
throw new CollectionException(e);
}
finally
{
iv_currentIndex++;
}
}
public boolean hasNext() throws IOException, CollectionException
{
return iv_currentIndex < iv_linesFromFile.size();
}
public Progress[] getProgress() {
return new Progress[]{
new ProgressImpl(iv_currentIndex, iv_linesFromFile.size(),Progress.ENTITIES)};
}
/**
* Gets the total number of documents that will be returned by this
* collection reader.
* @return the number of documents in the collection
*/
public int getNumberOfDocuments()
{
return iv_linesFromFile.size();
}
public void close() throws IOException {}
}