Package eu.stratosphere.test.recordJobs.graph

Source Code of eu.stratosphere.test.recordJobs.graph.EnumTrianglesRdfFoaf$CloseTriads

/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.graph;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;

import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsFirstExcept;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.io.DelimitedInputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;

/**
* Implementation of the triangle enumeration example Pact program.
* The program expects a file with RDF triples (in XML serialization) as input. Triples must be separated by line breaks.
*
* The program filters for foaf:knows predicates to identify relationships between two entities (typically persons).
* Relationships are interpreted as edges in a social graph. Then the program enumerates all triangles which are built
* by edges in that graph.
*
* Usually, triangle enumeration is used as a pre-processing step to identify highly connected subgraphs.
* The algorithm was published as MapReduce job by J. Cohen in "Graph Twiddling in a MapReduce World".
* The Pact version was described in "MapReduce and PACT - Comparing Data Parallel Programming Models" (BTW 2011).
*/
public class EnumTrianglesRdfFoaf implements Program, ProgramDescription {

  private static final long serialVersionUID = 1L;

  /**
   * Reads RDF triples and filters on the foaf:knows RDF predicate.
   * The foaf:knows RDF predicate indicates that the RDF subject and object (typically of type foaf:person) know each
   * other.
   * Therefore, knowing connections between people are extracted and handles as graph edges.
   * The EdgeInFormat filters all rdf triples with foaf:knows predicates. The subjects and objects URLs are
   * compared.
   * The lexicographically smaller URL is set as the first field of the output record, the greater one as the second field.
   */
  public static class EdgeInFormat extends DelimitedInputFormat {
    private static final long serialVersionUID = 1L;

    private final StringValue rdfSubj = new StringValue();
    private final StringValue rdfPred = new StringValue();
    private final StringValue rdfObj = new StringValue();
   
    @Override
    public Record readRecord(Record target, byte[] bytes, int offset, int numBytes) {
      final int limit = offset + numBytes;
      int startPos = offset;
     
      // read RDF subject
      startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, ' ', rdfSubj, '"');
      if (startPos < 0) {
        // invalid record, exit
        return null;
      }
      // read RDF predicate
      startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, ' ', rdfPred, '"');
      if (startPos < 0 || !rdfPred.getValue().equals("<http://xmlns.com/foaf/0.1/knows>")) {
        // invalid record or predicate is not a foaf-knows predicate, exit
        return null;
      }
      // read RDF object
      startPos = parseVarLengthEncapsulatedStringField(bytes, startPos, limit, ' ', rdfObj, '"');
      if (startPos < 0) {
        // invalid record, exit
        return null;
      }

      // compare RDF subject and object
      if (rdfSubj.compareTo(rdfObj) <= 0) {
        // subject is smaller, subject becomes first attribute, object second
        target.setField(0, rdfSubj);
        target.setField(1, rdfObj);
      } else {
        // object is smaller, object becomes first attribute, subject second
        target.setField(0, rdfObj);
        target.setField(1, rdfSubj);
      }

      return target; 
    }
   
    /*
     * Utility method to efficiently parse encapsulated, variable length strings
     */
    private int parseVarLengthEncapsulatedStringField(byte[] bytes, int startPos, int limit, char delim, StringValue field, char encaps) {
     
      boolean isEncaps = false;
     
      // check whether string is encapsulated
      if (bytes[startPos] == encaps) {
        isEncaps = true;
      }
     
      if (isEncaps) {
        // string is encapsulated
        for (int i = startPos; i < limit; i++) {
          if (bytes[i] == encaps) {
            if (bytes[i+1] == delim) {
              field.setValueAscii(bytes, startPos, i-startPos+1);
              return i+2;
            }
          }
        }
        return -1;
      } else {
        // string is not encapsulated
        int i;
        for (i = startPos; i < limit; i++) {
          if (bytes[i] == delim) {
            field.setValueAscii(bytes, startPos, i-startPos);
            return i+1;
          }
        }
        if (i == limit) {
          field.setValueAscii(bytes, startPos, i-startPos);
          return i+1;
        } else {
          return -1;
        }
      }
    }
  }

  /**
   * Builds triads (open triangle) from all two edges that share a vertex.
   * The common vertex is
   */
  @ConstantFields(0)
  public static class BuildTriads extends ReduceFunction implements Serializable {
    private static final long serialVersionUID = 1L;
   
    // list of non-matching vertices
    private final ArrayList<StringValue> otherVertices = new ArrayList<StringValue>(32);
   
    // matching vertex
    private final StringValue matchVertex = new StringValue();
   
    // mutable output record
    private final Record result = new Record();
   
    // initialize list of non-matching vertices for one vertex
    public BuildTriads() {
      this.otherVertices.add(new StringValue());
    }

    @Override
    public void reduce(Iterator<Record> records, Collector<Record> out) throws Exception {
      // read the first edge
      final Record rec = records.next();
      // read the matching vertex
      rec.getFieldInto(0, this.matchVertex);
      // read the non-matching vertex and add it to the list
      rec.getFieldInto(1, this.otherVertices.get(0));
     
      // set the matching vertex in the output record
      this.result.setField(0, this.matchVertex);
     
      int numEdges = 1;
      // while there are more edges
      while (records.hasNext()) {

        // read the next edge
        final Record next = records.next();
       
        final StringValue myVertex;
        // obtain an object to store the non-matching vertex
        if (numEdges >= this.otherVertices.size()) {
          // we need an additional vertex object
          // create the object
          myVertex = new StringValue();
          // and put it in the list
          this.otherVertices.add(myVertex);
        } else {
          // we reuse a previously created object from the list
          myVertex = this.otherVertices.get(numEdges);
        }
        // read the non-matching vertex into the obtained object
        next.getFieldInto(1, myVertex);
       
        // combine the current edge with all vertices in the non-matching vertex list
        for (int i = 0; i < numEdges; i++) {
          // get the other non-matching vertex
          final StringValue otherVertex = this.otherVertices.get(i);
          // add my and other vertex to the output record depending on their ordering
          if (otherVertex.compareTo(myVertex) < 0) {
            this.result.setField(1, otherVertex);
            this.result.setField(2, myVertex);
            out.collect(this.result);
          } else {
            next.setField(2, otherVertex);
            out.collect(next);
          }
        }
       
        numEdges++;
      }
    }   
  }

  /**
   * Matches all missing edges with existing edges from input.
   * If the missing edge for a triad is found, the triad is transformed to a triangle by adding the missing edge.
   *
   * The function itself performs no comparison: it forwards the triad record unchanged for every
   * (triad, edge) pair it is called with and never reads the edge record. Which pairs are formed is decided by
   * the key fields configured on the join operator in {@link EnumTrianglesRdfFoaf#getPlan(String...)}.
   */
  @ConstantFieldsFirstExcept({})
  public static class CloseTriads extends JoinFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Emits the triad as a triangle.
     *
     * @param triad       record from the first join input; emitted as is
     * @param missingEdge record from the second join input; not read
     * @param out         receives the triangle
     */
    @Override
    public void join(Record triad, Record missingEdge, Collector<Record> out) throws Exception {
      // emit triangle: the triad record is passed on unmodified, nothing is copied from the matched edge
      out.collect(triad);
    }
  }

  /**
   * Assembles the Plan of the triangle enumeration example Pact program.
   */
  @Override
  public Plan getPlan(String... args) {

    // parse job parameters
    int numSubTasks   = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String edgeInput = (args.length > 1 ? args[1] : "");
    String output    = (args.length > 2 ? args[2] : "");

    FileDataSource edges = new FileDataSource(new EdgeInFormat(), edgeInput, "BTC Edges");
   
    ReduceOperator buildTriads = ReduceOperator.builder(new BuildTriads(), StringValue.class, 0)
      .name("Build Triads")
      .build();

    JoinOperator closeTriads = JoinOperator.builder(new CloseTriads(), StringValue.class, 1, 0)
      .keyField(StringValue.class, 2, 1)
      .name("Close Triads")
      .build();
    closeTriads.setParameter("INPUT_LEFT_SHIP_STRATEGY", "SHIP_REPARTITION_HASH");
    closeTriads.setParameter("INPUT_RIGHT_SHIP_STRATEGY", "SHIP_REPARTITION_HASH");
    closeTriads.setParameter("LOCAL_STRATEGY", "LOCAL_STRATEGY_HASH_BUILD_SECOND");

    FileDataSink triangles = new FileDataSink(new CsvOutputFormat(), output, "Output");
    CsvOutputFormat.configureRecordFormat(triangles)
      .recordDelimiter('\n')
      .fieldDelimiter(' ')
      .field(StringValue.class, 0)
      .field(StringValue.class, 1)
      .field(StringValue.class, 2);

    triangles.setInput(closeTriads);
    closeTriads.setSecondInput(edges);
    closeTriads.setFirstInput(buildTriads);
    buildTriads.setInput(edges);

    Plan plan = new Plan(triangles, "Enumerate Triangles");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
  }

  /*
   * (non-Javadoc)
   * @see eu.stratosphere.pact.common.plan.PlanAssemblerDescription#getDescription()
   */
  @Override
  public String getDescription() {
    return "Parameters: [numSubStasks] [inputRDFTriples] [outputTriangles]";
  }
}
TOP

Related Classes of eu.stratosphere.test.recordJobs.graph.EnumTrianglesRdfFoaf$CloseTriads

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.