Package org.apache.uima.analysis_engine.impl

Source Code of org.apache.uima.analysis_engine.impl.NewlineResegmenter

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.analysis_engine.impl;

import java.util.Iterator;

import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;

/**
*
*/
public class NewlineResegmenter extends JCasMultiplier_ImplBase {
  JCas mCurrentInputCas;

  String mDoc;

  int mCurIndex = 0;

  StringBuffer mBuf = new StringBuffer();

  JCas[] mJCases = new JCas[2];

  int mActiveJCas = 0;

  boolean mHasNext;

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.CasSegmenter_ImplBase#process(org.apache.uima.cas.CAS)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    mCurrentInputCas = aJCas;
    mDoc = aJCas.getDocumentText();
    readToNextNewline();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    return mHasNext;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    // we should already have a JCas ready to return
    JCas toReturn = mJCases[mActiveJCas];
    mJCases[mActiveJCas] = null;
    assert toReturn != null;

    // now go read the next segment into our other JCas
    mActiveJCas ^= 1;
    if (mCurIndex < mDoc.length()) {
      readToNextNewline();
    } else {
      // end of the input document
      mCurIndex = 0;
      mHasNext = false;
    }

    return toReturn;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.JCasSegmenter_ImplBase#getCasInstancesRequired()
   */
  public int getCasInstancesRequired() {
    return 2;
  }

  private void readToNextNewline() {
    // does this doc contain a newline?
    int nlIndex = mDoc.indexOf('\n', mCurIndex);
    if (nlIndex == -1) // no newline
    {
      // append entire rest of doc to active buffer
      int begin = mBuf.length(); // record start offset of new text
      mBuf.append(mDoc.substring(mCurIndex));
      if (mJCases[mActiveJCas] == null) {
        mJCases[mActiveJCas] = getEmptyJCas();
      }
      // add SourceDocumentInformation to active JCas
      SourceDocumentInformation sdi = new SourceDocumentInformation(mJCases[mActiveJCas]);
      sdi.setBegin(begin);
      sdi.setEnd(mBuf.length());
      sdi.setUri(getCasSourceUri(mCurrentInputCas));
      sdi.addToIndexes();
      mHasNext = false; // we need to see another input CAS before we can create output
      mCurIndex = 0;
    } else // yes, newline
    {
      // append doc up to newline
      int begin = mBuf.length(); // record start offset of new text
      mBuf.append(mDoc.substring(mCurIndex, nlIndex));
      if (mJCases[mActiveJCas] == null) {
        mJCases[mActiveJCas] = getEmptyJCas();
      }
      // add SourceDocumentInformation to active JCas
      SourceDocumentInformation sdi = new SourceDocumentInformation(mJCases[mActiveJCas]);
      sdi.setBegin(begin);
      sdi.setEnd(mBuf.length());
      sdi.setUri(getCasSourceUri(mCurrentInputCas));
      sdi.addToIndexes();
      // set doc text
      mJCases[mActiveJCas].setDocumentText(mBuf.toString());
      mBuf.setLength(0);
      mCurIndex = nlIndex + 1;
      mHasNext = true; // ready to output!
    }
  }

  private String getCasSourceUri(JCas jcas) {
    Iterator iter = jcas.getJFSIndexRepository().getAnnotationIndex(SourceDocumentInformation.type)
            .iterator();
    if (iter.hasNext()) {
      SourceDocumentInformation sdi = (SourceDocumentInformation) iter.next();
      return sdi.getUri();
    } else {
      return "unknown";
    }
  }
}
TOP

Related Classes of org.apache.uima.analysis_engine.impl.NewlineResegmenter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.