package com.ontology2.bakemono.pse3;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import com.google.common.base.Function;
import com.google.common.base.Splitter;
import com.google.common.cache.LoadingCache;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Node_URI;
import com.hp.hpl.jena.graph.Triple;
import com.ontology2.bakemono.abstractions.KeyValueAcceptor;
import com.ontology2.bakemono.abstractions.PrimaryKeyValueAcceptor;
import com.ontology2.bakemono.jena.NodePair;
import com.ontology2.bakemono.jena.WritableTriple;
import com.ontology2.bakemono.primitiveTriples.PrimitiveTriple;
import com.ontology2.bakemono.primitiveTriples.PrimitiveTripleCodec;
import com.ontology2.rdf.JenaUtil;
public class PSE3Mapper extends Mapper<LongWritable,Text,WritableTriple,WritableTriple> {
// Constant writable 1; not referenced anywhere in the visible code.
private static final LongWritable ONE = new LongWritable(1);
// Shared logger; final so it cannot be reassigned after class initialization.
private static final org.apache.commons.logging.Log logger = LogFactory.getLog(PSE3Mapper.class);
// Per-mapper cache used by map() to turn a raw node string into a Jena Node.
final LoadingCache<String,Node> nodeParser=JenaUtil.createNodeParseCache();
// Decodes one line of input text into a PrimitiveTriple.
final static PrimitiveTripleCodec p3Codec=new PrimitiveTripleCodec();
// Matches '$' followed by exactly four characters from 0-9/A-F; compiled once
// for the class rather than once per mapper instance.
private static final Pattern $escape=Pattern.compile("[$][0-9A-F]{4}");
//
// The fields below are deliberately package-private (no access modifier) so
// that the test classes can replace or inspect them
//
// Receives every successfully parsed triple from map(), which passes the same
// triple as both key and value; assigned in setup().
KeyValueAcceptor<WritableTriple,WritableTriple> accepted;
/**
 * Runs the superclass setup and then installs a {@link PrimaryKeyValueAcceptor}
 * built from the task context as the acceptor that map() writes parsed
 * triples to.
 *
 * @param context the task context handed in by the framework
 * @throws IOException if the superclass setup fails with an I/O error
 * @throws InterruptedException if the superclass setup is interrupted
 */
@Override
public void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    this.accepted = new PrimaryKeyValueAcceptor(context);
}
// Applied by map() to the raw subject, predicate and object strings before
// they are looked up in the node parser cache.
Function<String,String> nodePreprocessor=new Unescape$();
// Not read or updated anywhere in the visible code.
int myCnt=0;
// The triple most recently built by map(); overwritten each time a record
// parses successfully.
WritableTriple writableTriple;
/**
 * Decodes one line of input into a primitive triple, preprocesses and parses
 * its three fields into Jena nodes, and writes the resulting triple to the
 * {@code accepted} acceptor as both key and value.
 *
 * <p>If preprocessing or node parsing fails for any reason (including a
 * subject or predicate that does not parse to a {@code Node_URI}), the fact
 * is logged, the REJECTED counter is incremented and nothing is written.
 * Otherwise the ACCEPTED counter is incremented after the write.
 *
 * @param arg0 the input key; not used
 * @param arg1 one line of input text, decoded with the primitive triple codec
 * @param c    the task context used for output and counters
 * @throws IOException if writing the accepted triple fails
 * @throws InterruptedException if writing the accepted triple is interrupted
 */
@Override
public void map(LongWritable arg0, Text arg1, Context c) throws IOException, InterruptedException {
    // NOTE(review): decoding happens outside the try block, so a failure in
    // decode() propagates to the framework instead of counting as a rejection.
    PrimitiveTriple row3=p3Codec.decode(arg1.toString());
    try {
        String rawSubject = nodePreprocessor.apply(row3.getSubject());
        String rawPredicate = nodePreprocessor.apply(row3.getPredicate());
        String rawObject = nodePreprocessor.apply(row3.getObject());

        // Subject and predicate must be URIs; the casts throw for anything else
        // and the record is then rejected by the catch block below.
        Node_URI subject=(Node_URI) nodeParser.get(rawSubject);
        Node_URI predicate=(Node_URI) nodeParser.get(rawPredicate);
        Node object=nodeParser.get(rawObject);

        Triple realTriple=new Triple(subject,predicate,object);
        writableTriple = new WritableTriple(realTriple);
    } catch(Throwable e) {
        // NOTE(review): Throwable is kept deliberately broad so that one bad
        // record never kills the task; this also swallows Errors.
        //
        // Log the fact exactly as it was read: subject, predicate, OBJECT.
        String factString=row3.getSubject()+"\t"+row3.getPredicate()+"\t"+row3.getObject()+"\t.";
        logger.warn("Caught exception while parsing fact ["+factString+"]",e);
        reject(c, row3);
        return;
    }

    accepted.write(writableTriple,writableTriple,c);
    incrementCounter(c,PSE3Counters.ACCEPTED,1);
}
//
// Detect $XXXX escape sequences ('$' followed by four upper-case hex digits) in the string form of any node
//
/**
 * Tells whether the string form of a node contains a {@code $XXXX} escape
 * sequence anywhere.
 *
 * @param that the node to inspect
 * @return true if the escape pattern is found in {@code that.toString()}
 */
private boolean has$escape(Node that) {
    final String rendered = that.toString();
    return $escape.matcher(rendered).find();
}
/**
 * Records one rejected input record by adding 1 to the REJECTED counter.
 *
 * @param c    the task context that owns the counter
 * @param row3 the rejected triple; currently not used
 */
private void reject(Context c, PrimitiveTriple row3) throws IOException, InterruptedException {
    final long oneRejection = 1;
    incrementCounter(c, PSE3Counters.REJECTED, oneRejection);
}
//
// The null check below keeps the tests from failing: the mock Context object
// they pass in always returns null from getCounter(). With a more sophisticated
// mock, the system could perhaps produce an individual mock for each counter so
// that we can watch what happens with the counters.
//
/**
 * Adds {@code amount} to the counter identified by {@code counterId}, doing
 * nothing when the context has no such counter (getCounter returns null).
 *
 * @param context   the task context to fetch the counter from
 * @param counterId the enum constant naming the counter
 * @param amount    how much to add
 */
private void incrementCounter(Context context,Enum <?> counterId,long amount) {
    final Counter target = context.getCounter(counterId);
    if (target == null) {
        return;
    }
    target.increment(amount);
}
/**
 * Delegates to the superclass cleanup; this mapper adds no cleanup work of
 * its own.
 *
 * <p>The parameter uses the inherited, parameterized {@code Context} type
 * (as setup and map do) rather than the raw {@code Mapper.Context}, so the
 * call to the superclass is type-checked.
 *
 * @param context the task context handed in by the framework
 * @throws IOException if the superclass cleanup fails with an I/O error
 * @throws InterruptedException if the superclass cleanup is interrupted
 */
@Override
protected void cleanup(Context context)
        throws IOException, InterruptedException {
    super.cleanup(context);
}
/**
 * Preprocessor for raw node strings: text wrapped in angle brackets has its
 * {@code $XXXX} escapes decoded, text wrapped in double quotes is passed to
 * the raw-string hook, and everything else is returned unchanged.
 */
public class Unescape$ implements Function<String,String>{
    /**
     * Dispatches on the delimiters that surround {@code input}.
     *
     * @param input the raw node text
     * @return the processed text
     */
    @Override
    public String apply(String input) {
        final boolean angleBracketed = input.startsWith("<") && input.endsWith(">");
        if (angleBracketed) {
            return applyToNode(input);
        }

        final boolean doubleQuoted = input.startsWith("\"") && input.endsWith("\"");
        return doubleQuoted ? applyToRawString(input) : input;
    }

    /** Decodes {@code $XXXX} escapes in an angle-bracketed node string. */
    public String applyToNode(String input) {
        final String unescaped = unescapeFreebaseKey(input);
        return unescaped;
    }

    // XXX -- unescaping of quoted strings is not implemented: the input comes
    // back untouched. Is this what we want?
    /** Returns a double-quoted string exactly as given. */
    public String applyToRawString(String input) {
        return input;
    }
}
// Would a lookup table (LUT) be faster?
/**
 * Converts a single hexadecimal digit character to its numeric value.
 * Only {@code 0-9} and upper-case {@code A-F} are recognized.
 *
 * @param digit the character to convert
 * @return 0..15 for a recognized digit, or -1 for any other character
 */
public static int digitToHex(char digit) {
    final boolean isDecimalDigit = '0' <= digit && digit <= '9';
    if (isDecimalDigit) {
        return digit - '0';
    }

    final boolean isUpperHexLetter = 'A' <= digit && digit <= 'F';
    return isUpperHexLetter ? 10 + (digit - 'A') : -1;
}
/**
 * Replaces every {@code $XXXX} escape (a dollar sign followed by four
 * upper-case hexadecimal digits) with the single character whose 16-bit
 * code is {@code XXXX}.
 *
 * <p>The decoding is all-or-nothing: if any dollar sign is followed by fewer
 * than four characters, or by a character that digitToHex does not accept,
 * the original string is returned unchanged.
 *
 * @param in the text to decode
 * @return the decoded text, or {@code in} itself when it contains no dollar
 *         sign or contains a malformed escape
 */
public static String unescapeFreebaseKey(String in) {
    int dollar = in.indexOf('$');
    if (dollar == -1) {
        return in;
    }

    final int length = in.length();
    final StringBuilder decoded = new StringBuilder();
    int copiedUpTo = 0;

    while (dollar != -1) {
        // Copy the literal text between the previous escape and this one.
        decoded.append(in, copiedUpTo, dollar);

        // A full escape needs the '$' plus four more characters.
        if (dollar + 5 > length) {
            return in;
        }

        // Fold the four hex digits into one 16-bit value, most significant first.
        int codeUnit = 0;
        for (int offset = 1; offset <= 4; offset++) {
            final int nibble = digitToHex(in.charAt(dollar + offset));
            if (nibble == -1) {
                return in;
            }
            codeUnit = (codeUnit << 4) + nibble;
        }
        decoded.append((char) codeUnit);

        copiedUpTo = dollar + 5;
        dollar = in.indexOf('$', copiedUpTo);
    }

    // Copy whatever follows the last escape (possibly nothing).
    decoded.append(in, copiedUpTo, length);
    return decoded.toString();
}
}