Package com.cloudera.flume.reporter.histogram

Source Code of com.cloudera.flume.reporter.histogram.RegexGroupHistogramSink

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.reporter.histogram;

import java.io.IOException;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.reporter.MultiReporter;
import com.cloudera.flume.reporter.builder.SimpleRegexReporterBuilder;
import com.google.common.base.Preconditions;

/**
* This takes a regex and a group index and generates a histogram based on the
* value extracted. Values that do not match are not counted.
*
* For example: the group index and regex combo of: 3, (\d+):(\d+):(\d+)
*
* for the following values: 123:456:789, abc:def:xyz, 11:22:33, 55:66:33
*
* would result in a histogram with (value, count) : (789, 1), (33,2).
*
* NOTE: the NFA-based regex algorithm used by java.util.regex.* (and in this
* class) is slow and does not scale. It is fully featured but has an
* exponential worst-case running time. This will be replaced with a faster but
* more memory hungry and less featured DFA-based regex algorithm. (We will lose
* capture groups).
*/
public class RegexGroupHistogramSink extends HistogramSink {
  Pattern pat;
  int grp;

  public RegexGroupHistogramSink(String name, Pattern pat, int grp) {
    super(name);
    this.pat = pat;
    this.grp = grp;
  }

  @Override
  public String extract(Event e) {
    String s = new String(e.getBody());
    Matcher m = pat.matcher(s);
    if (m.find()) {
      return m.group(grp);
    }
    return null;
  }

  public static SinkBuilder builder() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length == 1,
            "usage: regexhistospec(regexspecfile)");

        String fname = argv[0];
        SimpleRegexReporterBuilder srrb = new SimpleRegexReporterBuilder(fname);
        Collection<RegexGroupHistogramSink> sinks;
        try {
          sinks = srrb.load();
        } catch (IOException e) {
          throw new IllegalArgumentException(
              "Failed to create regex report from spec file " + fname + ": "
                  + e);
        }
        if (sinks.size() == 1)
          return sinks.iterator().next();

        EventSink snk = new MultiReporter(fname, sinks);
        return snk;
      }
    };
  }

  public static SinkBuilder builderSimple() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length == 3,
            "usage: regexhisto(name, regex, idx)");

        String name = argv[0];
        String regex = argv[1];
        Integer idx = Integer.parseInt(argv[2]);
        Pattern pat = Pattern.compile(regex);

        EventSink snk = new RegexGroupHistogramSink(name, pat, idx);
        return snk;

      }
    };
  }

}
TOP

Related Classes of com.cloudera.flume.reporter.histogram.RegexGroupHistogramSink

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.