Package org.apache.uima.examples.casMultiplier

Source Code of org.apache.uima.examples.casMultiplier.SimpleTextSegmenter

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.examples.casMultiplier;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/**
* An example CasMultiplier, which breaks large text documents into smaller segments. The minimum
* size of the segments as determined by the "SegmentSize" configuration parameter, but the break
* between segments will always occur at the next newline character, so segments will not be exactly
* that size.
*/
public class SimpleTextSegmenter extends JCasMultiplier_ImplBase {
  private String mDoc;

  private int mPos;

  private int mSegmentSize;

  private String mDocUri;

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    mSegmentSize = ((Integer) aContext.getConfigParameterValue("SegmentSize")).intValue();
  }

  /*
   * (non-Javadoc)
   *
   * @see JCasMultiplier_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    mDoc = aJCas.getDocumentText();
    mPos = 0;
    // retreive the filename of the input file from the CAS so that it can be added
    // to each segment
    FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (it.hasNext()) {
      SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next();
      mDocUri = fileLoc.getUri();
    } else {
      mDocUri = null;
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    return mPos < mDoc.length();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    int breakAt = mPos + mSegmentSize;
    if (breakAt > mDoc.length())
      breakAt = mDoc.length();
    // search for the next newline character. Note: this example segmenter implementation
    // assumes that the document contains many newlines. In the worst case, if this segmenter
    // is runon a document with no newlines, it will produce only one segment containing the
    // entire document text. A better implementation might specify a maximum segment size as
    // well as a minimum.
    while (breakAt < mDoc.length() && mDoc.charAt(breakAt - 1) != '\n')
      breakAt++;

    JCas jcas = getEmptyJCas();
    try {
      jcas.setDocumentText(mDoc.substring(mPos, breakAt));
      // if original CAS had SourceDocumentInformation, also add SourceDocumentInformatio
      // to each segment
      if (mDocUri != null) {
        SourceDocumentInformation sdi = new SourceDocumentInformation(jcas);
        sdi.setUri(mDocUri);
        sdi.setOffsetInSource(mPos);
        sdi.setDocumentSize(breakAt - mPos);
        sdi.addToIndexes();

        if (breakAt == mDoc.length()) {
          sdi.setLastSegment(true);
        }
      }

      mPos = breakAt;
      return jcas;
    } catch (Exception e) {
      jcas.release();
      throw new AnalysisEngineProcessException(e);
    }
  }

}
TOP

Related Classes of org.apache.uima.examples.casMultiplier.SimpleTextSegmenter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.