Package com.cloudera.flume.reporter.histogram

Source Code of com.cloudera.flume.reporter.histogram.MultiGrepReporterSink

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.reporter.histogram;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.arabidopsis.ahocorasick.AhoCorasick;
import org.arabidopsis.ahocorasick.SearchResult;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.reporter.MultiReporter;
import com.cloudera.flume.reporter.ReportEvent;
import com.cloudera.flume.reporter.builder.MultiGrepReporterBuilder;
import com.cloudera.flume.reporter.charts.ChartPackage;
import com.cloudera.util.Histogram;
import com.google.common.base.Preconditions;

/**
* This uses an AhoCorasick multi string search state machine to search the body
* of events. It essentially takes a set of strings and makes them into a large
* trie that allows for a one pass traversal that can result in many substring
* matches. Since this structure can return multiple matches, we increment hit
* counts by 1 per event.
*
* Example:
*
* searching for : foo, fool, foolish, bar, barish
*
* in the event: "barfing foo at the bar was foolish."
*
* Would increment by 1: bar, foo, fool, foolish. 'bar' and 'foo' are
* incremented once although there were two hits. 'fool', and 'foolish' are each
* incremented once.
*
* We can't use the other HistogramSink because this extractor pulls out
* multiple values instead of single values (tags vs strict categorization).
*/
public class MultiGrepReporterSink<T> extends EventSink.Base {

  final String name;
  final AhoCorasick<T> aho;
  final Histogram<String> histo;
  final HistogramChartGen<String> chartgen;

  /**
   * This will default to returning the string on a match.
   */
  public MultiGrepReporterSink(String name, AhoCorasick<T> aho) {
    this.name = name;
    this.aho = aho;
    this.histo = new Histogram<String>();
    this.chartgen = ChartPackage.createHistogramGen(); // new
    // GoogleHistogramChartGen<T>();
  }

  /**
   * We use this instead of a constructor to build these.
   */
  public static MultiGrepReporterSink<String> build(String name, String[] strs) {
    // build the Aho multistring search Trie structure
    AhoCorasick<String> aho = new AhoCorasick<String>();
    for (String s : strs) {
      // will return Strings to identify matches.
      aho.add(s.getBytes(), s);
    }
    return new MultiGrepReporterSink<String>(name, aho);
  }

  // return a set of matches (do not return duplicates)
  public Collection<T> extract(Event e) {
    Iterator<SearchResult<T>> iter = aho.search(e.getBody());
    Set<T> results = new HashSet<T>();

    while (iter.hasNext()) {
      SearchResult<T> res = iter.next();
      for (T o : res.getOutputs()) {
        results.add(o);
      }
    }
    return results;
  }

  @Override
  public void append(Event e) throws IOException, InterruptedException {
    Collection<T> ts = extract(e);
    // if failed to extract, skip
    for (T t : ts) {
      histo.increment(t.toString());
    }
    super.append(e);
  }

  @Override
  public void open() throws IOException, InterruptedException {
    aho.prepare();
  }

  @Override
  public String getName() {
    return name;
  }

  /**
   * TODO make not use legacy html report.
   */
  @Override
  public ReportEvent getMetrics() {
    return ReportEvent.createLegacyHtmlReport(name, chartgen.generate(histo)
        + "<pre>" + histo + "</pre>");
  }

  public Histogram<String> getHistogram() {
    return histo;
  }

  public static SinkBuilder builder() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length == 2,
            "usage: multigrepspec(name, grepspecfile)");

        String name = argv[0];
        String fname = argv[1];
        MultiGrepReporterBuilder mgrb = new MultiGrepReporterBuilder(name,
            fname);
        Collection<MultiGrepReporterSink<String>> sinks;
        try {
          sinks = mgrb.load();
        } catch (IOException e) {
          throw new IllegalArgumentException(
              "Failed to create multigrep report named " + name
                  + " with spec from file " + fname + ": " + e);
        }
        if (sinks.size() == 1)
          return sinks.iterator().next();

        EventSink snk = new MultiReporter(fname, sinks);
        return snk;
      }
    };
  }

  public static SinkBuilder builderSimple() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length >= 2,
            "usage: multigrep(name, str1[,str2...])");

        String name = argv[0];
        String[] strings = Arrays.copyOfRange(argv, 1, argv.length);
        EventSink snk = MultiGrepReporterSink.build(name, strings);
        return snk;
      }
    };
  }
}
TOP

Related Classes of com.cloudera.flume.reporter.histogram.MultiGrepReporterSink

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.