// Tracks metadata entries already written, so each field/family/language pair is sent at most once per JVM.
static HashSet<String> metadataSent = new HashSet<String>();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  // Bound the stream with getLength(): Text.getBytes() may return a backing array longer than the valid data.
  Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes(), 0, value.getLength()), UTF8));
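  // Null bytes delimit the components of column families and qualifiers throughout the schema below.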
  String NULL_BYTE = "\u0000";
  String colfPrefix = language + NULL_BYTE;
  String indexPrefix = "fi" + NULL_BYTE;
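  // A valid article produces mutations for four tables: the partitioned document table,
  // the metadata table, the global index, and the global reverse index.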
  if (article != null) {
    // Each article is assigned to a group; skip articles that belong to another mapper's group.
    int groupId = WikipediaMapper.getPartitionId(article, numGroups);
    if (groupId != myGroup)
      return;
    Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
    // Create the mutations for the document.
    // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
    Mutation m = new Mutation(partitionId);
    for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
      m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
      // Create mutations for the metadata table.
      String metadataKey = entry.getKey() + METADATA_EVENT_COLUMN_FAMILY + language;
      if (!metadataSent.contains(metadataKey)) {
        Mutation mm = new Mutation(entry.getKey());
        mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
        context.write(metadataTableName, mm);
        metadataSent.add(metadataKey);
      }
    }
    // Tokenize the content.
    Set<String> tokens = getTokens(article);
    // We are going to put the fields to be indexed into a multimap. This allows us to iterate
    // over the entire set once.
    Multimap<String,String> indexFields = HashMultimap.create();
    // Add the normalized field values.
    LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
    for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
      indexFields.put(index.getKey(), index.getValue());
    // Add the tokens.
    for (String token : tokens)
      indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
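    // Each (field, value) pair yields one in-partition index entry plus global forward and reverse index entries.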
    for (Entry<String,String> index : indexFields.entries()) {
      // Create mutations for the in-partition index.
      // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0articleid
      m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
      // Create a Uid.List object for the global index value: a single UID with a count of one.
      Builder uidBuilder = Uid.List.newBuilder();
      uidBuilder.setIGNORE(false);
      uidBuilder.setCOUNT(1);
      uidBuilder.addUID(Integer.toString(article.getId()));
      Uid.List uidList = uidBuilder.build();
      Value val = new Value(uidList.toByteArray());
      // Create mutations for the global index.
      // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
      Mutation gm = new Mutation(index.getValue());
      gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
      context.write(indexTableName, gm);
      // Create mutations for the global reverse index. Reversing the field value in the row
      // supports efficient leading-wildcard lookups.
      Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
      grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
      context.write(reverseIndexTableName, grm);
      // Create mutations for the metadata table, recording the normalizer applied to this indexed field.
      String metadataKey = index.getKey() + METADATA_INDEX_COLUMN_FAMILY + language;
      if (!metadataSent.contains(metadataKey)) {
        Mutation mm = new Mutation(index.getKey());
        mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
        context.write(metadataTableName, mm);
        metadataSent.add(metadataKey);
      }
    }
    // Add the entire text to the document section of the table.
    // Row is the partition, colf is 'd', colq is language\0articleid, value is the Base64-encoded document text
    m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes(UTF8))));
    context.write(tablename, m);
  } else {
    context.getCounter("wikipedia", "invalid articles").increment(1);
  }
}
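// For reference, a minimal sketch of what a partitioner like WikipediaMapper.getPartitionId
// might look like (an assumption for illustration, not the actual implementation):
//
//   static int getPartitionId(Article article, int numPartitions) {
//     return article.getId() % numPartitions;
//   }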