Package com.cloudera.cdk.data

Examples of com.cloudera.cdk.data.DatasetRepository


public class CreateStagedDataset extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");

    // where the schema is stored
    URI schemaURI = URI.create("resource:simple-log.avsc");

    // create a Parquet dataset for long-term storage
    repo.create("logs", new DatasetDescriptor.Builder()
        .format(Formats.PARQUET)
        .schemaUri(schemaURI)
        .partitionStrategy(new PartitionStrategy.Builder()
            .year("timestamp", "year")
            .month("timestamp", "month")
            .day("timestamp", "day")
            .build())
        .build());

    // create an Avro dataset to temporarily hold data
    repo.create("logs-staging", new DatasetDescriptor.Builder()
        .format(Formats.AVRO)
        .schemaUri(schemaURI)
        .partitionStrategy(new PartitionStrategy.Builder()
            .day("timestamp", "day")
            .build())
        .build());

    return 0;
  }
}


  public int run(String[] args) throws Exception {
    // going to generate a lot of random log messages
    final Random rand = new Random();

    // open the repository
    final DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");

    // data is written to the staging dataset
    final Dataset<GenericRecord> staging = repo.load("logs-staging");
    final DatasetWriter<GenericRecord> writer = staging.newWriter();

    // this is going to build our simple log records
    final GenericRecordBuilder builder = new GenericRecordBuilder(
        staging.getDescriptor().getSchema());
    // ... (remainder of the method not shown)
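
The excerpt ends before the write loop. A minimal sketch of how it might continue, assuming the simple-log schema has "timestamp" and "message" fields (the field names and record count are assumptions, not taken from the excerpt):

    writer.open();
    try {
      // build and write a batch of random log records to the staging dataset
      for (int i = 0; i < 100; i++) {
        GenericRecord record = builder
            .set("timestamp", System.currentTimeMillis()) // assumed field
            .set("message", "log-" + rand.nextInt(100))   // assumed field
            .build();
        writer.write(record);
      }
    } finally {
      writer.close();
    }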

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HDFS dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Load the events dataset
    Dataset<GenericRecord> events = repo.load("events");

    // Get a reader for the dataset and read all the events
    DatasetReader<GenericRecord> reader = events.newReader();
    try {
      reader.open();
      // ... (remainder of the method not shown)
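
The excerpt cuts off inside the try block. The read pattern used throughout these examples iterates the reader and closes it in a finally block; a sketch of the likely remainder (what the example does with each event is an assumption):

      for (GenericRecord event : reader) {
        System.out.println(event); // assumed: print each event
      }
    } finally {
      reader.close();
    }
    return 0;
  }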

    // Get a log4j logger
    Logger logger = Logger.getLogger(App.class);

    // Load the events dataset and get its schema
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    Schema schema = repo.load("events").getDescriptor().getSchema();

    // Build some events using the generic Avro API and log them using log4j
    GenericRecordBuilder builder = new GenericRecordBuilder(schema);
    for (long i = 0; i < 10; i++) {
      GenericRecord event = builder.set("id", i)
          // ... (remainder of the statement not shown)
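
The builder chain is cut off mid-statement; presumably the remaining fields are set and the record is handed to log4j. A sketch assuming the schema has a "message" field:

      GenericRecord event = builder.set("id", i)
          .set("message", "Hello " + i) // assumed field
          .build();
      logger.info(event); // delivery depends on the log4j appender configuration
    }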

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HDFS dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Drop the events dataset
    boolean success = repo.delete("events");

    return success ? 0 : 1;
  }

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HDFS dataset repository rooted at /tmp/data
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Create a dataset of events with the Avro schema in the repository
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:event.avsc")
        .build();
    repo.create("events", descriptor);

    return 0;
  }
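
The descriptor points at event.avsc on the classpath rather than embedding the schema inline. For illustration only, a hypothetical events schema expressed with Avro's SchemaBuilder (the real event.avsc is not shown in the excerpt; the "id" and "message" fields are borrowed from the logging snippet above):

    Schema eventSchema = SchemaBuilder.record("Event")
        .fields()
        .name("id").type().longType().noDefault()
        .name("message").type().stringType().noDefault()
        .endRecord();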

public class DescribeDatasets extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs://localhost:8020/user/cloudera");

    Schema ratingSchema = SchemaBuilder.record("Rating")
        .fields()
        .name("userId").type().intType().noDefault()
        .name("movieId").type().intType().noDefault()
        .name("rating").type().intType().noDefault()
        .name("timeInSeconds").type().intType().noDefault()
        .endRecord();

    // create the ratings dataset
    repo.create("ratings", new DatasetDescriptor.Builder()
//        .location("hdfs:u.data")
        .format(Formats.CSV)
        .property("cdk.csv.delimiter", "\t")
        .schema(ratingSchema)
        .build());

//    movie id | movie title | release date | video release date |
//    IMDb URL | unknown | Action | Adventure | Animation |
//        Children's | Comedy | Crime | Documentary | Drama | Fantasy |
//    Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
//        Thriller | War | Western |
    Schema movieSchema = SchemaBuilder.record("Movie")
        .fields()
        .name("movieId").type().intType().noDefault()
        .name("title").type().stringType().noDefault()
        .name("releaseDate").type().stringType().noDefault()
        .name("videoReleaseDate").type().stringType().noDefault()
        .name("imdbURL").type().stringType().noDefault()
        // ignore genre fields for now
        .endRecord();

    repo.create("movies", new DatasetDescriptor.Builder()
//        .location("hdfs:u.item")
        .format(Formats.CSV)
        .property("cdk.csv.delimiter", "|")
        .schema(movieSchema)
        .build());
    // ... (remainder of the method not shown)
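
The excerpt stops after the two create calls. Given the class name, the remainder presumably loads each dataset and prints its descriptor; a rough sketch (entirely an assumption):

    for (String name : new String[] {"ratings", "movies"}) {
      DatasetDescriptor descriptor = repo.load(name).getDescriptor();
      System.out.println(name + ": format=" + descriptor.getFormat()
          + ", schema=" + descriptor.getSchema());
    }
    return 0;
  }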

public class ReadMovies extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    DatasetRepository repo = DatasetRepositories.open("repo:hdfs://localhost:8020/user/cloudera");

    Dataset movies = repo.load("movies");
    DatasetReader reader = movies.newReader();
    try {
      reader.open();
      for (Object rec : reader) {
        System.err.println("Movie: " + rec);
      }
    } finally {
      reader.close();
    }
    return 0;
  }
}

  @Override
  public int run(String[] args) throws Exception {

    // Construct an HDFS dataset repository rooted at /tmp/data
    DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");

    // Construct an HCatalog dataset repository using external Hive tables
    DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");

    // Turn debug on while in development.
    getPipeline().enableDebug();
    getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

    // Load the events dataset and get the correct partition to sessionize
    Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
    Dataset<StandardEvent> partition;
    if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
      partition = getLatestPartition(eventsDataset);
    } else {
      partition = getPartitionForURI(eventsDataset, args[0]);
    }

    // Create a parallel collection from the working partition
    PCollection<StandardEvent> events = read(
        CrunchDatasets.asSource(partition, StandardEvent.class));

    // Process the events into sessions, using a combiner
    PCollection<Session> sessions = events
      .parallelDo(new DoFn<StandardEvent, Session>() {
        @Override
        public void process(StandardEvent event, Emitter<Session> emitter) {
          emitter.emit(Session.newBuilder()
              .setUserId(event.getUserId())
              .setSessionId(event.getSessionId())
              .setIp(event.getIp())
              .setStartTimestamp(event.getTimestamp())
              .setDuration(0)
              .setSessionEventCount(1)
              .build());
        }
      }, Avros.specifics(Session.class))
      .by(new MapFn<Session, Pair<Long, String>>() {
        @Override
        public Pair<Long, String> map(Session session) {
          return Pair.of(session.getUserId(), session.getSessionId());
        }
      }, Avros.pairs(Avros.longs(), Avros.strings()))
      .groupByKey()
      .combineValues(new CombineFn<Pair<Long, String>, Session>() {
        @Override
        public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable,
            Emitter<Pair<Pair<Long, String>, Session>> emitter) {
          String ip = null;
          long startTimestamp = Long.MAX_VALUE;
          long endTimestamp = Long.MIN_VALUE;
          int sessionEventCount = 0;
          for (Session s : pairIterable.second()) {
            ip = s.getIp();
            startTimestamp = Math.min(startTimestamp, s.getStartTimestamp());
            endTimestamp = Math.max(endTimestamp, s.getStartTimestamp() + s.getDuration());
            sessionEventCount += s.getSessionEventCount();
          }
          emitter.emit(Pair.of(pairIterable.first(), Session.newBuilder()
              .setUserId(pairIterable.first().first())
              .setSessionId(pairIterable.first().second())
              .setIp(ip)
              .setStartTimestamp(startTimestamp)
              .setDuration(endTimestamp - startTimestamp)
              .setSessionEventCount(sessionEventCount)
              .build()));
        }
      })
      .parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {
        @Override
        public void process(Pair<Pair<Long, String>, Session> pairSession,
            Emitter<Session> emitter) {
          emitter.emit(pairSession.second());
        }
      }, Avros.specifics(Session.class));

    // Write the sessions to the "sessions" Dataset
    getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")),
        Target.WriteMode.APPEND);

    return run().succeeded() ? 0 : 1;
  }
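
The helpers getLatestPartition and getPartitionForURI are defined elsewhere in the class. A rough sketch of what getLatestPartition might look like, assuming the CDK 0.x API where a partitioned Dataset exposes its partitions via getPartitions(), and assuming partitions iterate in ascending key order (both assumptions):

  private Dataset<StandardEvent> getLatestPartition(Dataset<StandardEvent> events) {
    // Walk down the partition hierarchy, keeping the last partition seen
    // at each level (assumed to be the most recent one).
    Dataset<StandardEvent> ds = events;
    while (ds.getDescriptor().isPartitioned()) {
      Dataset<StandardEvent> latest = null;
      for (Dataset<StandardEvent> partition : ds.getPartitions()) {
        latest = partition;
      }
      if (latest == null) {
        return ds; // no partitions at this level
      }
      ds = latest;
    }
    return ds;
  }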
