Package com.cloudera.cdk.morphline.solr

Source Code of com.cloudera.cdk.morphline.solr.TokenizeTextBuilder$ReusableStringReader

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.solr;

import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineCompilationException;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.MorphlineRuntimeException;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.google.common.base.Preconditions;
import com.typesafe.config.Config;

/**
* A command that uses the embedded Solr/Lucene Analyzer library to generate tokens from a text
* string, without sending data to a Solr server.
*/
public final class TokenizeTextBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("tokenizeText");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new TokenizeText(this, config, parent, child, context);
  }
 
 
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class TokenizeText extends AbstractCommand {
   
    private final String inputFieldName;
    private final String outputFieldName;
    private final Analyzer analyzer;
    private final CharTermAttribute token; // cached
    private final ReusableStringReader reader = new ReusableStringReader(); // cached
   
    public TokenizeText(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.inputFieldName = getConfigs().getString(config, "inputField");
      this.outputFieldName = getConfigs().getString(config, "outputField");     
      String solrFieldType = getConfigs().getString(config, "solrFieldType");     
      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      IndexSchema schema = locator.getIndexSchema();
      FieldType fieldType = schema.getFieldTypeByName(solrFieldType);
      if (fieldType == null) {
        throw new MorphlineCompilationException("Missing Solr field type in schema.xml for name: " + solrFieldType, config);
      }
      this.analyzer = fieldType.getAnalyzer();
      Preconditions.checkNotNull(analyzer);
      try { // register CharTermAttribute for later (implicit) reuse
        this.token = analyzer.tokenStream("content", reader).addAttribute(CharTermAttribute.class);
      } catch (IOException e) {
        throw new MorphlineCompilationException("Cannot create token stream", config, e);
      }
      Preconditions.checkNotNull(token);
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record record) {
      try {
        List outputValues = record.get(outputFieldName);
        for (Object value : record.get(inputFieldName)) {
          reader.setValue(value.toString());
          TokenStream tokenStream = analyzer.tokenStream("content", reader);
          tokenStream.reset();
          while (tokenStream.incrementToken()) {
            if (token.length() > 0) { // incrementToken() updates the token!
              String tokenStr = new String(token.buffer(), 0, token.length());
              outputValues.add(tokenStr);
            }
          }
          tokenStream.end();
          tokenStream.close();
        }
      } catch (IOException e) {
        throw new MorphlineRuntimeException(e);
      }
     
      // pass record to next command in chain:
      return super.doProcess(record);
    }

  }
 
 
  // Copied from org.apache.lucene.document.Field.java from lucene-4.3.0
  /*
   * Licensed to the Apache Software Foundation (ASF) under one or more
   * contributor license agreements.  See the NOTICE file distributed with
   * this work for additional information regarding copyright ownership.
   * The ASF licenses this file to You under the Apache License, Version 2.0
   * (the "License"); you may not use this file except in compliance with
   * the License.  You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  private static final class ReusableStringReader extends Reader {
    private int pos = 0, size = 0;
    private String s = null;
   
    void setValue(String s) {
      this.s = s;
      this.size = s.length();
      this.pos = 0;
    }
   
    @Override
    public int read() {
      if (pos < size) {
        return s.charAt(pos++);
      } else {
        s = null;
        return -1;
      }
    }
   
    @Override
    public int read(char[] c, int off, int len) {
      if (pos < size) {
        len = Math.min(len, size-pos);
        s.getChars(pos, pos+len, c, off);
        pos += len;
        return len;
      } else {
        s = null;
        return -1;
      }
    }
   
    @Override
    public void close() {
      pos = size; // this prevents NPE when reading after close!
      s = null;
    }
  }
}
TOP

Related Classes of com.cloudera.cdk.morphline.solr.TokenizeTextBuilder$ReusableStringReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.