Package opennlp.tools.formats

Source Code of opennlp.tools.formats.LeipzigDoccatSampleStream

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.formats;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
* Stream filter to produce document samples out of a Leipzig sentences.txt file.
* In the Leipzig corpus the encoding of the various sentences.txt files is defined by
* the language. The language must be specified to produce the category tags and is used
* to determine the correct input encoding.
* <p>
* The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
* by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
* exactly the same tokenization during testing and training.
*/
public class LeipzigDoccatSampleStream extends
    FilterObjectStream<String, DocumentSample> {
 
  private final String language;
  private final int sentencesPerDocument;

  /**
   * Creates a new LeipzigDoccatSampleStream with the specified parameters.
   *
   * @param language the Leipzig input sentences.txt file
   * @param sentencesPerDocument the number of sentences which should be grouped into once {@link DocumentSample}
   * @param in the InputStream pointing to the contents of the sentences.txt input file
   */
  LeipzigDoccatSampleStream(String language, int sentencesPerDocument,
      InputStream in) throws IOException {
    super(new PlainTextByLineStream(in, "UTF-8"));
    this.language = language;
    this.sentencesPerDocument = sentencesPerDocument;
  }
 
  public DocumentSample read() throws IOException {

    int count = 0;

    StringBuilder sampleText = new StringBuilder();

    String line;
    while (count < sentencesPerDocument && (line = samples.read()) != null) {

      String tokens[] = SimpleTokenizer.INSTANCE.tokenize(line);
     
      if (tokens.length == 0) {
        throw new IOException("Empty lines are not allowed!");
      }
       
      // Always skip first token, that is the sentence number!
      for (int i = 1; i < tokens.length; i++) {
        sampleText.append(tokens[i]);
        sampleText.append(' ');
      }
     
      count++;
    }

   
    if (sampleText.length() > 0) {
      return new DocumentSample(language, sampleText.toString());
    }
 
    return null;
  }
}
TOP

Related Classes of opennlp.tools.formats.LeipzigDoccatSampleStream

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.