Package org.kitesdk.examples.spark

Source Code of org.kitesdk.examples.spark.CorrelateEventsTask$CorrelationKey

/*
* Copyright 2014 Cloudera, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.kitesdk.examples.spark;

import com.google.common.base.Objects;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.kitesdk.data.event.CorrelatedEvents;
import org.kitesdk.data.event.StandardEvent;
import org.kitesdk.data.mapreduce.DatasetKeyInputFormat;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import scala.Tuple2;

public class CorrelateEventsTask implements Serializable {

  private static final long FIVE_MIN_MILLIS = TimeUnit.MINUTES.toMillis(5);
  String eventsUri;
  String correlatedEventsUri;

  public CorrelateEventsTask(String eventsUri, String correlatedEventsUri) {
    this.eventsUri = eventsUri;
    this.correlatedEventsUri = correlatedEventsUri;
  }

  /*
   * This task correlates events based on IP address and timestamp. The goal is
   * to find any "click" events that come from the same IP address and occur
   * within 5 minutes of an "alert" event. The process works by first converting
   * timestamps into 5 minute increments. This means each event will be mapped
   * to the nearest 5 minute mark before the event happened and the nearest
   * 5 minute mark after the event happened. These rounded timestamps are
   * combined with the IP address of the event to do an approximate self join of
   * the data. The events are then iterated over to check for two conditions:
   *
   *   1) There is an alert event in the same bucket
   *   2) That alert is actually less than 5 minutes apart from the given click
   *
   * The task will write out all of the "alert" events that have at least one
   * "click" event from the same IP address and within 5 minutes along with the
   * list of "click" events that were correlated.
   */
  public void run() throws IOException {
    Configuration conf = new Configuration();
    DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
    DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

    // Create our Spark configuration and get a Java context
    SparkConf sparkConf = new SparkConf()
        .setAppName("Correlate Events")
        // Configure the use of Kryo serialization including our Avro registrator
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf,
        DatasetKeyInputFormat.class, StandardEvent.class, Void.class);

    // Map each event to two correlation keys. One with the IP address and the
    // nearest 5 minute interval that happened before the event and one with the
    // IP address and the nearest 5 minute interval that happened after the event
    JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
        new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
          @Override
          public Iterable<Tuple2<CorrelationKey, StandardEvent>>
              call(Tuple2<StandardEvent, Void> t) throws Exception {
            List<Tuple2<CorrelationKey, StandardEvent>> result =
                new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(2);

            StandardEvent event = t._1();
            long loTimestamp = createLoTimestamp(event.getTimestamp());
            long hiTimestamp = createHiTimestamp(event.getTimestamp());
            String ip = event.getIp().toString();

            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                new CorrelationKey(loTimestamp, ip), event));
            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                new CorrelationKey(hiTimestamp, ip), event));

            return result;
          }
        });

    // Group the events by they correlation key
    JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

    // Generate potential matches by creating a list of alerts along with the
    // matched list of clicks. If no alerts were found with this correlation
    // key, then output an empty pair
    JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
        new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>(){

          @Override
          public Tuple2<List<StandardEvent>, List<StandardEvent>> call(Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
            Iterable<StandardEvent> allEvents = t._2();
            List<StandardEvent> alerts = new ArrayList<StandardEvent>();
            List<StandardEvent> clicks = new ArrayList<StandardEvent>();

            for (StandardEvent event : allEvents) {
              if (event.getEventDetails() != null &&
                  event.getEventDetails().containsKey(new Utf8("type")) &&
                  "alert".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                alerts.add(event);
              } else if (event.getEventDetails() != null &&
                  event.getEventDetails().containsKey(new Utf8("type")) &&
                  "click".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                clicks.add(event);
              }
            }

            if (alerts.isEmpty()) {
              return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
            } else {
              return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
            }
          }
        });

    // Verify that the matched events are true matches (i.e. the timestamps
    // are really less than or equal to 5 minutes apart
    JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
        new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {

        @Override
        public Iterable<Tuple2<CorrelatedEvents, Void>> call(Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
          List<Tuple2<CorrelatedEvents, Void>> results =
              new ArrayList<Tuple2<CorrelatedEvents, Void>>();
          List<StandardEvent> alerts = t._1();
          List<StandardEvent> clicks = t._2();

          for (StandardEvent alert : alerts) {
            List<StandardEvent> correlated = new ArrayList<StandardEvent>();
            for (StandardEvent click : clicks) {
              if (Math.abs(alert.getTimestamp() - click.getTimestamp())
                  <= FIVE_MIN_MILLIS) {
                correlated.add(click);
              }
            }
            if (!correlated.isEmpty()) {
              results.add(new Tuple2(CorrelatedEvents.newBuilder()
                  .setEvent(alert)
                  .setCorrelated(correlated)
                  .build(), null));
            }
          }

          return results;
        }
      });

    // Write the data to a Kite dataset
    matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class,
        DatasetKeyOutputFormat.class, conf);
  }

  private static long createLoTimestamp(long timestamp) {
    return timestamp - (timestamp % FIVE_MIN_MILLIS) - FIVE_MIN_MILLIS;
  }

  private static long createHiTimestamp(long timestamp) {
    return timestamp - (timestamp % FIVE_MIN_MILLIS) + FIVE_MIN_MILLIS;
  }

  private static class CorrelationKey implements Serializable {
    Long timeStamp;
    String ip;

    public CorrelationKey(Long timeStamp, String ip) {
      this.timeStamp = timeStamp;
      this.ip = ip;
    }

    public String getIp() {
      return ip;
    }

    public void setIp(String ip) {
      this.ip = ip;
    }

    public Long getTimeStamp() {
      return timeStamp;
    }

    public void setTimeStamp(Long timeStamp) {
      this.timeStamp = timeStamp;
    }

    @Override
    public boolean equals(Object obj) {
      if (obj == null) {
        return false;
      }
      if (getClass() != obj.getClass()) {
        return false;
      }
      final CorrelationKey other = (CorrelationKey) obj;

      return Objects.equal(this.timeStamp, other.timeStamp) &&
          Objects.equal(this.ip, other.ip);
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(timeStamp, ip);
    }
  }
}
TOP

Related Classes of org.kitesdk.examples.spark.CorrelateEventsTask$CorrelationKey

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.