Package com.cloudera.recordbreaker.analyzer

Examples of com.cloudera.recordbreaker.analyzer.SchemaDescriptor


            FileSummaryData fsd2 = fsa.getFileSummaryData(fid2);
            DataDescriptor dd1 = fsd1.getDataDescriptor();
            DataDescriptor dd2 = fsd2.getDataDescriptor();
            List<SchemaDescriptor> sds1 = dd1.getSchemaDescriptor();
            List<SchemaDescriptor> sds2 = dd2.getSchemaDescriptor();                       
            SchemaDescriptor sd1 = sds1.get(0);
            SchemaDescriptor sd2 = sds2.get(0);

            List<Schema> unionFreeSchemas1 = SchemaUtils.getUnionFreeSchemasByFrequency(sd1, 100, true);
            Schema schema1 = unionFreeSchemas1.get(0);
            List<Schema> unionFreeSchemas2 = SchemaUtils.getUnionFreeSchemasByFrequency(sd2, 100, true);
            Schema schema2 = unionFreeSchemas2.get(0);
View Full Code Here


        return result;
      }
    }

    if (sds.size() > 0) {
      SchemaDescriptor sd = sds.get(0);
      Schema schema = sd.getSchema();

      //
      // Step 1.  Figure out the hierarchical labels from the Schema.
      // These are the fields we'll grab from each tuple.
      //
      // Doing so entails "unrolling" the schemas that contain unions.
      // That is, translating such schemas into a set of union-free schemas.
      //
      List<List<List<DataField>>> perSchemaTupleLists = new ArrayList<List<List<DataField>>>();
      List<List<List<DataField>>> dataOrderTupleLists = new ArrayList<List<List<DataField>>>();
      List<Integer> schemaOrder = new ArrayList<Integer>();
      List<SchemaPair> schemaFrequency = new ArrayList<SchemaPair>();

      int numRows = 0;
      TreeMap<String, Schema> uniqueUnrolledSchemas = new TreeMap<String, Schema>();
      for (Iterator it = sd.getIterator(); it.hasNext(); ) {
        GenericData.Record gr = (GenericData.Record) it.next();
        List<Schema> grSchemas = SchemaUtils.unrollUnionsWithData(schema, gr, false);
        if (grSchemas != null) {
          for (Schema grs: grSchemas) {
            if (uniqueUnrolledSchemas.get(grs.toString()) == null) {
              uniqueUnrolledSchemas.put(grs.toString(), grs);
            }
          }
        }
        if (numRows >= MAX_ROWS) {
          break;
        }
        numRows++;
      }
      List<Schema> allSchemas = new ArrayList(uniqueUnrolledSchemas.values());
      List<List<String>> schemaLabelLists = new ArrayList<List<String>>();

      for (int i = 0; i < allSchemas.size(); i++) {
        Schema s1 = allSchemas.get(i);
        schemaLabelLists.add(SchemaUtils.flattenNames(s1));
        perSchemaTupleLists.add(new ArrayList<List<DataField>>());
        schemaFrequency.add(new SchemaPair(i, 0));
      }

      //
      // Step 2.  Build the set of rows for display.  One row per tuple.
      //
      numRows = 0;
      boolean incompleteFileScan = false;
      int lastBestIdx = -1;
      boolean hasMoreRows = false;
      for (Iterator it = sd.getIterator(); it.hasNext(); ) {
        GenericData.Record gr = (GenericData.Record) it.next();
        if (numRows >= MAX_ROWS) {
          hasMoreRows = true;
          incompleteFileScan = true;
          break;
View Full Code Here

TOP

Related Classes of com.cloudera.recordbreaker.analyzer.SchemaDescriptor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.