Package org.apache.nutch.indexwriter.elastic

Source Code of org.apache.nutch.indexwriter.elastic.ElasticIndexWriter

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.indexwriter.elastic;

import static org.elasticsearch.node.NodeBuilder.nodeBuilder;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.NutchDocument;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ListenableActionFuture;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
*/
public class ElasticIndexWriter implements IndexWriter {
  public static Logger LOG = LoggerFactory.getLogger(ElasticIndexWriter.class);

  private static final int DEFAULT_MAX_BULK_DOCS = 250;
  private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;

  private Client client;
  private Node node;
  private String defaultIndex;

  private Configuration config;

  private BulkRequestBuilder bulk;
  private ListenableActionFuture<BulkResponse> execute;
  private int port = -1;
  private String host = null;
  private String clusterName = null;
  private int maxBulkDocs;
  private int maxBulkLength;
  private long indexedDocs = 0;
  private int bulkDocs = 0;
  private int bulkLength = 0;
  private boolean createNewBulk = false;

  @Override
  public void open(JobConf job, String name) throws IOException {
    clusterName = job.get(ElasticConstants.CLUSTER);

    host = job.get(ElasticConstants.HOST);
    port = job.getInt(ElasticConstants.PORT, 9300);

    Builder settingsBuilder = ImmutableSettings.settingsBuilder().classLoader(
        Settings.class.getClassLoader());

    BufferedReader reader = new BufferedReader(
        job.getConfResourceAsReader("elasticsearch.conf"));
    String line;
    String parts[];

    while ((line = reader.readLine()) != null) {
      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
        line.trim();
        parts = line.split("=");

        if (parts.length == 2) {
          settingsBuilder.put(parts[0].trim(), parts[1].trim());
        }
      }
    }

    if (StringUtils.isNotBlank(clusterName))
      settingsBuilder.put("cluster.name", clusterName);

    // Set the cluster name and build the settings
    Settings settings = settingsBuilder.build();

    // Prefer TransportClient
    if (host != null && port > 1) {
      client = new TransportClient(settings)
          .addTransportAddress(new InetSocketTransportAddress(host, port));
    } else if (clusterName != null) {
      node = nodeBuilder().settings(settings).client(true).node();
      client = node.client();
    }

    bulk = client.prepareBulk();
    defaultIndex = job.get(ElasticConstants.INDEX, "nutch");
    maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS,
        DEFAULT_MAX_BULK_DOCS);
    maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH,
        DEFAULT_MAX_BULK_LENGTH);
  }

  @Override
  public void write(NutchDocument doc) throws IOException {
    String id = (String) doc.getFieldValue("id");
    String type = doc.getDocumentMeta().get("type");
    if (type == null)
      type = "doc";
    IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id);

    Map<String, Object> source = new HashMap<String, Object>();

    // Loop through all fields of this doc
    for (String fieldName : doc.getFieldNames()) {
      if (doc.getField(fieldName).getValues().size() > 1) {
        source.put(fieldName, doc.getFieldValue(fieldName));
        // Loop through the values to keep track of the size of this
        // document
        for (Object value : doc.getField(fieldName).getValues()) {
          bulkLength += value.toString().length();
        }
      } else {
        source.put(fieldName, doc.getFieldValue(fieldName));
        bulkLength += doc.getFieldValue(fieldName).toString().length();
      }
    }
    request.setSource(source);

    // Add this indexing request to a bulk request
    bulk.add(request);
    indexedDocs++;
    bulkDocs++;

    if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
      LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = "
          + bulkLength + ", total docs = " + indexedDocs
          + ", last doc in bulk = '" + id + "']");
      // Flush the bulk of indexing requests
      createNewBulk = true;
      commit();
    }
  }

  @Override
  public void delete(String key) throws IOException {
    try {
      DeleteRequestBuilder builder = client.prepareDelete();
      builder.setIndex(defaultIndex);
      builder.setType("doc");
      builder.setId(key);
      builder.execute().actionGet();
    } catch (ElasticsearchException e) {
      throw makeIOException(e);
    }
  }

  public static IOException makeIOException(ElasticsearchException e) {
    final IOException ioe = new IOException();
    ioe.initCause(e);
    return ioe;
  }

  @Override
  public void update(NutchDocument doc) throws IOException {
    write(doc);
  }

  @Override
  public void commit() throws IOException {
    if (execute != null) {
      // wait for previous to finish
      long beforeWait = System.currentTimeMillis();
      BulkResponse actionGet = execute.actionGet();
      if (actionGet.hasFailures()) {
        for (BulkItemResponse item : actionGet) {
          if (item.isFailed()) {
            throw new RuntimeException("First failure in bulk: "
                + item.getFailureMessage());
          }
        }
      }
      long msWaited = System.currentTimeMillis() - beforeWait;
      LOG.info("Previous took in ms " + actionGet.getTookInMillis()
          + ", including wait " + msWaited);
      execute = null;
    }
    if (bulk != null) {
      if (bulkDocs > 0) {
        // start a flush, note that this is an asynchronous call
        execute = bulk.execute();
      }
      bulk = null;
    }
    if (createNewBulk) {
      // Prepare a new bulk request
      bulk = client.prepareBulk();
      bulkDocs = 0;
      bulkLength = 0;
    }
  }

  @Override
  public void close() throws IOException {
    // Flush pending requests
    LOG.info("Processing remaining requests [docs = " + bulkDocs
        + ", length = " + bulkLength + ", total docs = " + indexedDocs + "]");
    createNewBulk = false;
    commit();
    // flush one more time to finalize the last bulk
    LOG.info("Processing to finalize last execute");
    createNewBulk = false;
    commit();

    // Close
    client.close();
    if (node != null) {
      node.close();
    }
  }

  @Override
  public String describe() {
    StringBuffer sb = new StringBuffer("ElasticIndexWriter\n");
    sb.append("\t").append(ElasticConstants.CLUSTER)
        .append(" : elastic prefix cluster\n");
    sb.append("\t").append(ElasticConstants.HOST).append(" : hostname\n");
    sb.append("\t").append(ElasticConstants.PORT).append(" : port\n");
    sb.append("\t").append(ElasticConstants.INDEX)
        .append(" : elastic index command \n");
    sb.append("\t").append(ElasticConstants.MAX_BULK_DOCS)
        .append(" : elastic bulk index doc counts. (default 250) \n");
    sb.append("\t").append(ElasticConstants.MAX_BULK_LENGTH)
        .append(" : elastic bulk index length. (default 2500500 ~2.5MB)\n");
    return sb.toString();
  }

  @Override
  public void setConf(Configuration conf) {
    config = conf;
    String cluster = conf.get(ElasticConstants.CLUSTER);
    String host = conf.get(ElasticConstants.HOST);

    if (StringUtils.isBlank(cluster) && StringUtils.isBlank(host)) {
      String message = "Missing elastic.cluster and elastic.host. At least one of them should be set in nutch-site.xml ";
      message += "\n" + describe();
      LOG.error(message);
      throw new RuntimeException(message);
    }
  }

  @Override
  public Configuration getConf() {
    return config;
  }
}
TOP

Related Classes of org.apache.nutch.indexwriter.elastic.ElasticIndexWriter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.