Package com.cloudera.cdk.tools

Source Code of com.cloudera.cdk.tools.CombinedLogFormatConverter$ConvertFn

/**
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.tools;

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetRepositories;
import com.cloudera.cdk.data.DatasetRepository;
import com.cloudera.cdk.data.crunch.CrunchDatasets;
import com.google.common.io.Resources;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Target;
import org.apache.crunch.types.avro.AvroType;
import org.apache.crunch.types.avro.Avros;
import org.apache.crunch.util.CrunchTool;
import org.apache.hadoop.util.ToolRunner;

/**
* A tool for converting files in
* <a href="http://en.wikipedia.org/wiki/Common_Log_Format">Combined Log Format</a> to a
* {@link com.cloudera.cdk.data.Dataset}.
*/
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SE_NO_SERIALVERSIONID",
    justification = "Serialization not needed between different versions of this class")
public class CombinedLogFormatConverter extends CrunchTool {

  private static final String LOG_PATTERN = "^(\\S+) (\\S+) (\\S+) \\[" +
      "([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+|-) \"([^\"]*)\" \"([^\"]+)\"";

  @Override
  public int run(String... args) throws Exception {
    if (args.length != 3) {
      System.err.println("Usage: " + CombinedLogFormatConverter.class.getSimpleName() +
          " <input> <dataset_uri> <dataset name>");
      return 1;
    }
    String input = args[0];
    String datasetUri = args[1];
    String datasetName = args[2];

    Schema schema = new Schema.Parser().parse(
        Resources.getResource("combined_log_format.avsc").openStream());

    // Create the dataset
    DatasetRepository repo = DatasetRepositories.open(datasetUri);
    DatasetDescriptor datasetDescriptor = new DatasetDescriptor.Builder()
        .schema(schema).build();
    Dataset<Object> dataset = repo.create(datasetName, datasetDescriptor);

    // Run the job
    final String schemaString = schema.toString();
    AvroType<GenericData.Record> outputType = Avros.generics(schema);
    PCollection<String> lines = readTextFile(input);
    PCollection<GenericData.Record> records = lines.parallelDo(
        new ConvertFn(schemaString), outputType);
    getPipeline().write(records, CrunchDatasets.asTarget(dataset),
        Target.WriteMode.APPEND);
    run();
    return 0;
  }

  private String asString(String s) {
    if ("-".equals(s) || "\"-\"".equals(s)) {
      return null;
    }
    return s;
  }

  private Integer asInt(String s) {
    if ("-".equals(s)) {
      return null;
    }
    return Integer.parseInt(s);
  }

  public static void main(String... args) throws Exception {
    int rc = ToolRunner.run(new CombinedLogFormatConverter(), args);
    System.exit(rc);
  }

  @edu.umd.cs.findbugs.annotations.SuppressWarnings({"SE_INNER_CLASS",
      "SE_TRANSIENT_FIELD_NOT_RESTORED"})
  private class ConvertFn extends DoFn<String, GenericData.Record> {
    private final String schemaString;
    transient Pattern pattern;
    transient GenericRecordBuilder recBuilder;

    public ConvertFn(String schemaString) {
      this.schemaString = schemaString;
    }

    public void initialize() {
      pattern = Pattern.compile(LOG_PATTERN);
      recBuilder = new GenericRecordBuilder(new Schema.Parser().parse(schemaString));
    }

    @Override
    public void process(String line, Emitter<GenericData.Record> emitter) {
      Matcher matcher = pattern.matcher(line);
      if (matcher.matches()) {
        // parse line into components
        recBuilder.set("host", asString(matcher.group(1)));
        recBuilder.set("rfc931_identity", asString(matcher.group(2)));
        recBuilder.set("username", asString(matcher.group(3)));
        recBuilder.set("datetime", asString(matcher.group(4)));
        recBuilder.set("request", asString(matcher.group(5)));
        recBuilder.set("http_status_code", asInt(matcher.group(6)));
        recBuilder.set("response_size", asInt(matcher.group(7)));
        recBuilder.set("referrer", asString(matcher.group(8)));
        recBuilder.set("user_agent", asString(matcher.group(9)));
        emitter.emit(recBuilder.build());
      } else {
        System.err.println("No match: " + line);
      }

    }
  }
}
TOP

Related Classes of com.cloudera.cdk.tools.CombinedLogFormatConverter$ConvertFn

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.