Package com.splout.db.hadoop

Examples of com.splout.db.hadoop.TableBuilder


  private static Schema SCHEMA_1 = new Schema("schema1", Fields.parse("id1:string, value1:string"));
  private static Schema SCHEMA_2 = new Schema("schema2", Fields.parse("id2:string, value2:string"));
  private static Schema SCHEMA_3 = new Schema("schema3", Fields.parse("id3:int, value3:string"));

  @Test
  public void testCorrectTablespace() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1").build();
    Table table2 = new TableBuilder(SCHEMA_2).addCSVTextFile("foo2.txt").partitionBy("id2").build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);

    // the partition count must be set for build() to succeed (see testMissingNPartitions below)
    builder.setNPartitions(2);
    builder.build();
  }

  @Test
  public void testCorrectTablespaceMultiplePartitionBy() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1", "value1").build();
    Table table2 = new TableBuilder(SCHEMA_2).addCSVTextFile("foo2.txt").partitionBy("id2", "value2").build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);

    builder.setNPartitions(2);
    builder.build();
  }
 
  @Test
  public void testCorrectTablespaceWithReplicated() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1").build();
    Table table2 = new TableBuilder(SCHEMA_2).addCSVTextFile("foo2.txt").partitionBy("id2").build();
    Table table3 = new TableBuilder(SCHEMA_3).addCSVTextFile("foo3.txt").replicateToAll().build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);
    builder.add(table3);

    builder.setNPartitions(2);
    builder.build();
  }

  // ---- The number of partitions must be specified for the Tablespace ---- //
 
  @Test(expected=TablespaceBuilderException.class)
  public void testMissingNPartitions() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1").build();
    Table table2 = new TableBuilder(SCHEMA_2).addCSVTextFile("foo2.txt").partitionBy("id2").build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);

    // no setNPartitions() call here: build() must throw TablespaceBuilderException
    builder.build();
  }

  // ---- At least one table must be partitioned ---- //
 
  @Test(expected=TablespaceBuilderException.class)
  public void testNoPartitionTable() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").replicateToAll().build();
    Table table2 = new TableBuilder(SCHEMA_2).addCSVTextFile("foo2.txt").replicateToAll().build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);
   
    // nPartitions is set, so the failure comes from having no partitioned table
    builder.setNPartitions(2);
    builder.build();
  }

  // ---- There must be no collision between table names (schema names) ---- //
 
  @Test(expected=TablespaceBuilderException.class)
  public void testTableNameCollision() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1").build();
    Table table2 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo2.txt").partitionBy("id1").build();
   
    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);
   
    builder.setNPartitions(2);
    builder.build();
  }

  // ---- Partition fields must be of the same type across tables ---- //
 
  @Test(expected=TablespaceBuilderException.class)
  public void testInvalidCoPartition() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("id1").build();
    Table table2 = new TableBuilder(SCHEMA_3).addCSVTextFile("foo2.txt").partitionBy("id3").build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);

    builder.setNPartitions(2);
    builder.build();
  }
 
  @Test(expected=TablespaceBuilderException.class)
  public void testInvalidCoPartitionMultipleFields() throws TableBuilderException, TablespaceBuilderException {
    Table table1 = new TableBuilder(SCHEMA_1).addCSVTextFile("foo1.txt").partitionBy("value1", "id1").build();
    Table table2 = new TableBuilder(SCHEMA_3).addCSVTextFile("foo2.txt").partitionBy("value3", "id3").build();

    TablespaceBuilder builder = new TablespaceBuilder();
    builder.add(table1);
    builder.add(table2);

    builder.setNPartitions(2);
    builder.build();
  }

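The following, longer snippet (apparently taken from Splout SQL's Wikipedia page-counts example) shows TableBuilder combined with a custom RecordProcessor, a compound index, an insertion sort order, and SQLite tuning. Identifiers such as tableSchema, fileStatuses, memoryForIndexing, nPartitions, outFs, and outPath are defined elsewhere in the full example.
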
      // define the schema of the input files: projectcode, pagename, pageviews, bytes
      Schema fileSchema = new Schema("pagecountsfile",
          Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

      // instantiate a TableBuilder
      TableBuilder tableBuilder = new TableBuilder(tableSchema);

      // for every input file...
      for(FileStatus fileStatus : fileStatuses) {
        String fileName = fileStatus.getPath().getName();
        // strip the date and the hour from the file name
        String fileDate = fileName.split("-")[1];
        String fileHour = fileName.split("-")[2].substring(0, 2);
        // instantiate a custom RecordProcessor to process the records of this file
        PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate,
            fileHour);
        // use the tableBuilder method for adding each of the files to the mix
        tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
            TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING,
            fileSchema, recordProcessor);
      }

      // partition the dataset by pagename, which should give a fairly even distribution
      tableBuilder.partitionBy("pagename");
      // create a compound index on (pagename, date) so that typical queries over this dataset are fast
      tableBuilder.createIndex("pagename", "date");

      long nonExactPageSize = memoryForIndexing / 32000; // number of pages
      int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
      Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing
          + "] and there are 32000 pages.");
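      // Worked example (assumed figure): with memoryForIndexing = 100 MB = 104857600 bytes,
      // nonExactPageSize = 104857600 / 32000 = 3276 and log2(3276) ≈ 11.68, which rounds to 12,
      // so pageSize = 2^12 = 4096 bytes.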

      tableBuilder.initialSQL("pragma page_size=" + pageSize);
      // insertion order matters for query speed: sorting by the indexed columns co-locates related rows on disk
      tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

      // instantiate a TablespaceBuilder
      TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

      // split this tablespace into the requested number of partitions:
      tablespaceBuilder.setNPartitions(nPartitions);
      tablespaceBuilder.add(tableBuilder.build());
      // enable case-sensitive LIKE so that prefix (autocomplete) queries can use the index
      tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

      HadoopUtils.deleteIfExists(outFs, outPath);