/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.Globals.Identity;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.utils.DimensionUtility;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
public class ExtractorOpenCalais implements IEntityExtractor
{
/**
 * Returns the identifier of this extractor.
 *
 * @return the constant string {@code "opencalais"}
 */
@Override
public String getName()
{
    final String extractorName = "opencalais";
    return extractorName;
}
// Capability name -> value, filled in by the constructor and served by getCapability()
private Map<EntityExtractorEnum, String> _capabilities = new HashMap<EntityExtractorEnum, String>();
// Endpoint that createPostMethod() posts the document text to
private static final String CALAIS_URL = "http://api.opencalais.com/tag/rs/enrich";
// License key sent in the "x-calais-licenseID" header; read from the properties in the
// constructor and optionally replaced by a per-source override in configure()
private String CALAIS_LICENSE = null;
// HTTP client used for every request made by this instance
private HttpClient client;
// Lower-cased OpenCalais node name -> extracted entity; filled by extractEntities() and used to
// resolve references found in relation nodes. NOTE(review): it is never cleared in this file, so
// it grows across documents - confirm that is intended.
private Map<String,EntityPojo> entityNameMap = new HashMap<String, EntityPojo>();
// Event type -> schema describing which relation columns feed which association fields
// (see loadEventSchemas())
private Map<String, EventSchemaPojo> eventSchemas;
// Verb category -> "metatype" value from the schema file; consulted by getEventType()
private Map<String, String> factOrEvent = new HashMap<String, String>();
private static final Logger logger = Logger.getLogger(ExtractorOpenCalais.class);
// Incremented in the constructor (only when running with the service identity); used to register
// the shutdown hook exactly once
private static AtomicLong numInstances = new AtomicLong(0);
// The single registered JVM shutdown hook that logs the counters below
private static ShutdownHook shutdownHook = null;
// Running total of attempts made for requests that received HTTP 403 (see extractEntities())
private static AtomicLong num_extraction_collisions = new AtomicLong(0);
// Number of extractEntities() calls made on non-null documents
private static AtomicLong num_extraction_requests = new AtomicLong(0);
// Maximum number of characters of text submitted per request; also advertised as the
// MaxInputBytes capability
private static final int MAX_LENGTH = 99000;
// When true, the raw relation objects are also written to the document metadata
private boolean bAddRawEventsToMetadata = false;
//_______________________________________________________________________
//_____________________________INITIALIZATION________________
//_______________________________________________________________________
public ExtractorOpenCalais()
{
PropertiesManager props = new PropertiesManager();
CALAIS_LICENSE = props.getExtractorKey("OpenCalais");
client = new HttpClient();
eventSchemas = loadEventSchemas();
//insert capabilities of this extractor
_capabilities.put(EntityExtractorEnum.Name, "OpenCalais");
_capabilities.put(EntityExtractorEnum.Quality, "1");
_capabilities.put(EntityExtractorEnum.GeotagExtraction, "true");
_capabilities.put(EntityExtractorEnum.MaxInputBytes, Integer.toString(MAX_LENGTH));
if (Identity.IDENTITY_SERVICE == Globals.getIdentity()) { // (ie not for API)
if ( 1 == numInstances.incrementAndGet() ) // (first time only...)
{
shutdownHook = new ShutdownHook();
Runtime.getRuntime().addShutdownHook(shutdownHook);
}
}
}
// Set on the first call to configure(); later calls return immediately
private boolean configured = false;

/**
 * One-shot configuration of this instance. Per-source extractor options take precedence
 * over the global properties:
 * "app.opencalais.store_raw_events" controls whether raw relation objects are written to
 * the document metadata, and "app.opencalais.apiKeyOverride" replaces the license key.
 * Only the first call has any effect.
 *
 * @param source the source being harvested, may be null
 */
private void configure(SourcePojo source)
{
    if (configured) {
        return;
    }
    configured = true;

    Boolean storeRawEvents = null;
    String keyOverride = null;

    // 1) Per-source overrides
    boolean hasSourceOptions = (null != source) && (null != source.getExtractorOptions());
    if (hasSourceOptions) {
        try {
            String rawFlag = source.getExtractorOptions().get("app.opencalais.store_raw_events");
            if (null != rawFlag) {
                storeRawEvents = Boolean.parseBoolean(rawFlag);
            }
        }
        catch (Exception e) {} // (best effort, fall back to the defaults)
        try {
            keyOverride = source.getExtractorOptions().get("app.opencalais.apiKeyOverride");
        }
        catch (Exception e) {} // (best effort, keep the global key)
    }

    // 2) Global default, only consulted when the source did not decide
    PropertiesManager globalProps = new PropertiesManager();
    try {
        if (null == storeRawEvents) {
            storeRawEvents = globalProps.getExtractionCapabilityEnabled(getName(), "store_raw_events");
        }
    }
    catch (Exception e) {} // (best effort, leave the flag untouched)

    // 3) Apply whatever was found
    if (null != storeRawEvents) {
        bAddRawEventsToMetadata = storeRawEvents;
    }
    if (null != keyOverride) {
        this.CALAIS_LICENSE = keyOverride;
    }
}
//_______________________________________________________________________
//_____________________________ENTITY EXTRACTOR FUNCTIONS________________
//_______________________________________________________________________
/**
 * Sends the document's full text to OpenCalais and merges the returned entities and
 * associations into the document.
 * <p>
 * Documents that are null, have no full text, or have fewer than 32 characters of text are
 * left untouched. An HTTP 403 response is retried up to 14 more times with a 1.8s pause
 * between attempts (INF-1101: 403 is returned when too many concurrent requests occur).
 * <p>
 * Every top-level node of the JSON response other than "doc" is inspected: nodes whose
 * "_typeGroup" is "entities" become {@link EntityPojo}s, nodes whose "_typeGroup" is
 * "relations" are converted with {@link #parseEvent(JsonNode)}. A failure on an individual
 * entity or relation is logged and that item is skipped. When raw event storage is enabled
 * the relation objects are also written to the "OpenCalaisEvents" metadata field, with
 * string values that reference a known entity replaced by that entity's index.
 * <p>
 * The HTTP connection is always released before returning.
 *
 * @param partialDoc the document to enrich; its full text must already be populated
 * @throws ExtractorDocumentLevelException if the request fails, OpenCalais returns a non-200
 *         status, or the response cannot be processed
 */
@Override
public void extractEntities(DocumentPojo partialDoc) throws ExtractorDocumentLevelException
{
    if (null == partialDoc) {
        return;
    }
    configure(partialDoc.getTempSource());
    num_extraction_requests.incrementAndGet();
    PostMethod method = null;
    try
    {
        if (null == partialDoc.getFullText()) {
            return;
        }
        if (partialDoc.getFullText().length() < 32) { // Else don't waste Extractor call/error logging
            return;
        }
        method = createPostMethod(partialDoc.getFullText());
        int responseCode = client.executeMethod(method);
        if ( responseCode == HttpStatus.SC_FORBIDDEN) //INF-1101 forbidden gets thrown when too many concurrent requests occur, try 14 more times
        {
            int count = 1;
            while ( count < 15 && responseCode == HttpStatus.SC_FORBIDDEN )
            {
                try {
                    Thread.sleep(1800);
                }
                catch (Exception e) {} // carry on...
                responseCode = client.executeMethod(method); //attempt call again
                count++;
            }
            num_extraction_collisions.addAndGet(count);
        }
        if ( responseCode == HttpStatus.SC_OK)
        {
            byte[] responseBytes = method.getResponseBody();
            String response = new String(responseBytes, "UTF-8");
            List<EntityPojo> entities = new ArrayList<EntityPojo>();
            List<AssociationPojo> events = new ArrayList<AssociationPojo>();
            ObjectMapper mapper = new ObjectMapper();
            JsonNode root = mapper.readValue(response, JsonNode.class);
            // Values and field names are walked in parallel
            Iterator<JsonNode> iter = root.getElements();
            Iterator<String> iterNames = root.getFieldNames();
            List<JsonNode> eventNodes = new ArrayList<JsonNode>();
            BasicDBList rawEventObjects = null;
            while ( iter.hasNext() && iterNames.hasNext() )
            {
                String currNodeName = iterNames.next();
                JsonNode currNode = iter.next();
                if (currNodeName.equals("doc")) {
                    continue; // (document-level information, not an entity/topic/relation)
                }
                // A node without a textual _typeGroup is neither an entity nor a relation: skip it
                // instead of failing the whole document
                JsonNode typeGroupNode = currNode.get("_typeGroup");
                String typeGroup = (null == typeGroupNode) ? null : typeGroupNode.getTextValue();
                if ( "entities".equals(typeGroup) )
                {
                    try
                    {
                        EntityPojo ep = new EntityPojo();
                        //get what fields we can
                        ep.setType(currNode.get("_type").getTextValue());
                        try {
                            ep.setDimension(DimensionUtility.getDimensionByType(ep.getType()));
                        }
                        catch (java.lang.IllegalArgumentException e) {
                            ep.setDimension(EntityPojo.Dimension.What);
                        }
                        String name = "";
                        JsonNode nameNode = null;
                        try
                        {
                            nameNode = currNode.get("name");
                            name = nameNode.getTextValue();
                        }
                        catch (Exception ex )
                        {
                            logger.debug("Error parsing name node: " + currNode.toString());
                            continue;
                        }
                        ep.setActual_name(name);
                        ep.setRelevance(Double.parseDouble(currNode.get("relevance").getValueAsText()));
                        ep.setFrequency((long)currNode.get("instances").size());
                        //attempt to get resolutions if they exist
                        JsonNode resolutionNode = currNode.get("resolutions");
                        if ( null != resolutionNode )
                        {
                            //resolution nodes are arrays, only the first resolution is used
                            JsonNode resolutionFirst = resolutionNode.get(0);
                            ep.setSemanticLinks(new ArrayList<String>());
                            ep.getSemanticLinks().add(resolutionFirst.get("id").getTextValue());
                            ep.setDisambiguatedName(resolutionFirst.get("name").getTextValue());
                            //check if we need to create a geo object
                            if ( null != resolutionFirst.get("latitude") )
                            {
                                GeoPojo gp = new GeoPojo();
                                String lat = resolutionFirst.get("latitude").getValueAsText();
                                String lon = resolutionFirst.get("longitude").getValueAsText();
                                gp.lat = Double.parseDouble(lat);
                                gp.lon = Double.parseDouble(lon);
                                ep.setGeotag(gp);
                            }
                        }
                        else {
                            ep.setDisambiguatedName(name); // (use actual name)
                        }
                        // Remember the entity under its node name so relations can refer back to it
                        entityNameMap.put(currNodeName.toLowerCase(), ep);
                        entities.add(ep);
                    }
                    catch (Exception ex)
                    {
                        logger.error("Error creating entity pojo from OpenCalaisNode: " + ex.getMessage(), ex);
                    }
                }
                else if ( "relations".equals(typeGroup) )
                {
                    // Relations are processed after all entities so every reference can be resolved
                    eventNodes.add(currNode);
                }
            }
            //handle events
            if (bAddRawEventsToMetadata) {
                // For now just re-process these into DB objects since we know that works...
                rawEventObjects = new BasicDBList();
            }
            for ( JsonNode eventNode : eventNodes )
            {
                AssociationPojo event = null;
                try
                {
                    event = parseEvent(eventNode);
                }
                catch (Exception ex)
                {
                    // One malformed relation must not discard the rest of the document
                    logger.error("Error creating association pojo from OpenCalaisNode: " + ex.getMessage(), ex);
                }
                //remove useless events (an event is useless if it only has a verb (guessing currently)
                if ( null != event )
                {
                    event = removeUselessEvents(event);
                    if ( null != event )
                    {
                        events.add(event);
                    }
                }
                if (bAddRawEventsToMetadata) {
                    BasicDBObject eventDbo = (BasicDBObject) com.mongodb.util.JSON.parse(eventNode.toString());
                    if (null != eventDbo) {
                        BasicDBObject transformObj = new BasicDBObject();
                        for (Map.Entry<String, Object> entries: eventDbo.entrySet()) {
                            if (entries.getValue() instanceof String) {
                                String val = (String) entries.getValue();
                                EntityPojo transformVal = findMappedEntityName(val);
                                if (null != transformVal) {
                                    // Known entity reference: store its index, keep the original under "<key>__hash"
                                    transformObj.put(entries.getKey(), transformVal.getIndex());
                                    transformObj.put(entries.getKey() + "__hash", val);
                                }
                                else {
                                    transformObj.put(entries.getKey(), val);
                                }
                            }
                            else {
                                transformObj.put(entries.getKey(), entries.getValue());
                            }
                        }
                        // (add to another list, which will get written to metadata)
                        rawEventObjects.add(transformObj);
                    }
                }
            }
            if (bAddRawEventsToMetadata) {
                partialDoc.addToMetadata("OpenCalaisEvents", rawEventObjects.toArray());
            }
            if (null != partialDoc.getEntities()) {
                partialDoc.getEntities().addAll(entities);
                partialDoc.setEntities(partialDoc.getEntities());
            }
            else {
                partialDoc.setEntities(entities);
            }
            if (null != partialDoc.getAssociations()) {
                partialDoc.getAssociations().addAll(events);
                partialDoc.setAssociations(partialDoc.getAssociations());
            }
            else {
                partialDoc.setAssociations(events);
            }
        }
        else // Error back from OC, presumably the input doc is malformed/too long
        {
            throw new InfiniteEnums.ExtractorDocumentLevelException("OpenCalais HTTP error code: " + Integer.toString(responseCode));
        }
    }
    catch (Exception e)
    {
        logger.debug("OpenCalais", e);
        // Report every failure (including the HTTP error above) as a document-level error
        throw new InfiniteEnums.ExtractorDocumentLevelException(e.getMessage());
    }
    finally
    {
        // The connection stays checked out until released, whatever the outcome of the request
        if (null != method) {
            method.releaseConnection();
        }
    }
}
/**
 * Filters out associations that carry no usable information.
 *
 * Current strategy: an association that has a verb but no entity1, no entity2,
 * no start time and no geo index is considered useless.
 *
 * @param event the association to inspect
 * @return null if the association is useless, otherwise the association itself
 */
private AssociationPojo removeUselessEvents(AssociationPojo event)
{
    // Without a verb the "verb only" rule cannot apply
    if ( null == event.getVerb() ) {
        return event;
    }
    // Any one of these makes the association worth keeping
    if ( null != event.getEntity1() ) {
        return event;
    }
    if ( null != event.getEntity2() ) {
        return event;
    }
    if ( null != event.getTime_start() ) {
        return event;
    }
    if ( null != event.getGeo_index() ) {
        return event;
    }
    return null;
}
/**
 * Not supported by this extractor: it only works on text that has already been extracted.
 *
 * @param partialDoc ignored
 * @throws RuntimeException always
 */
@Override
public void extractEntitiesAndText(DocumentPojo partialDoc)
        throws ExtractorDailyLimitExceededException,
               ExtractorDocumentLevelException
{
    final String reason = "You must have a textEngine or text object in front of this featureEngine.";
    throw new RuntimeException(reason);
}
/**
 * Looks up one of the capabilities registered by the constructor.
 *
 * @param capability the capability being queried
 * @return the value registered for that capability, or null if there is none
 */
@Override
public String getCapability(EntityExtractorEnum capability)
{
    String registeredValue = _capabilities.get(capability);
    return registeredValue;
}
//_______________________________________________________________________
//_____________________________UTILITY FUNCTIONS_________________________
//_______________________________________________________________________
/**
 * Builds the POST request that submits a piece of text to OpenCalais.
 * <p>
 * The text is truncated to at most MAX_LENGTH characters; the cut is moved back by one
 * character if it would otherwise fall between the two halves of a surrogate pair. The
 * request asks for a JSON response with generic relations enabled.
 *
 * @param text the text to analyse, must not be null
 * @return the configured (not yet executed) request
 * @throws UnsupportedEncodingException if UTF-8 is not supported by the platform
 * @throws IllegalStateException if no license key is configured; failing here avoids sending
 *         a request that can only be rejected
 */
private PostMethod createPostMethod(String text) throws UnsupportedEncodingException {
    String license = (null == CALAIS_LICENSE) ? "" : CALAIS_LICENSE.trim();
    if (license.length() == 0) {
        throw new IllegalStateException("No OpenCalais license key configured");
    }
    if (text.length() > MAX_LENGTH) {
        int cut = MAX_LENGTH;
        if (Character.isHighSurrogate(text.charAt(cut - 1))) {
            cut--; // (don't leave half of a surrogate pair at the end of the payload)
        }
        text = text.substring(0, cut);
    }
    PostMethod method = new PostMethod(CALAIS_URL);
    // Set mandatory parameters
    method.setRequestHeader("x-calais-licenseID", license);
    // Set input content type
    method.setRequestHeader("Content-Type", "text/raw; charset=UTF-8");
    // Set response/output format
    method.setRequestHeader("Accept", "application/json");
    method.setRequestHeader("enableMetadataType","GenericRelations");
    method.setRequestEntity(new StringRequestEntity(text,"text/plain","UTF-8"));
    return method;
}
/**
 * Resolves an OpenCalais entity reference to the entity extracted for it.
 *
 * OpenCalais refers to entities in the form http://s.opencalais.com/hash; entities seen
 * so far are stored in the entity map under that (lower-cased) reference.
 *
 * @param entity the reference to look up
 * @return the matching entity, or null if the reference is not in the map
 */
private EntityPojo findMappedEntityName(String entity)
{
    // A plain lookup already yields null for unknown references
    return entityNameMap.get(entity);
}
/**
 * Converts an OpenCalais relation node into an association, using the event schema
 * registered for the relation's (lower-cased) "_type".
 * <p>
 * The schema names the columns of the relation that hold entity 1, entity 2, the verb, the
 * location and the start/end time. Column values that reference a known entity are replaced
 * by that entity's name and index; other textual values are used as-is (lower-cased).
 * Columns that are missing, empty or not textual are skipped.
 * <p>
 * The association's geotag is taken from the first resolved entity that has one, and is
 * always overridden by the location column when that resolves to a geotagged entity.
 *
 * @param current_node the relation node
 * @return the association, or null if the node has no textual "_type" or there is no
 *         schema for its type
 */
public AssociationPojo parseEvent(JsonNode current_node)
{
    String entity_type = lowerCaseText(current_node.get("_type"));
    if ( null == entity_type )
    {
        logger.info("OpenCalais relation without a textual _type, ignoring: " + current_node);
        return null;
    }
    //find eventschema for this type if one exists
    EventSchemaPojo esp = eventSchemas.get(entity_type);
    if ( null == esp )
    {
        // It's OK just to use the log for this, at some point could consider passing in HarvestContext
        // so could use the per source logger
        logger.info("OpenCalais extractor does not have an event_schema for: " + entity_type);
        return null;
    }
    AssociationPojo ep = new AssociationPojo();

    //entity 1 (an array supplies entity 1 and, if it has a second element, entity 2)
    JsonNode ent1node = (null != esp.entity1column) ? current_node.get(esp.entity1column) : null;
    if ( null != ent1node )
    {
        if ( ent1node.isArray() )
        {
            Iterator<JsonNode> entiter = ent1node.getElements();
            if ( entiter.hasNext() )
            {
                assignEntity(ep, lowerCaseText(entiter.next()), true);
            }
            if ( entiter.hasNext() )
            {
                assignEntity(ep, lowerCaseText(entiter.next()), false);
            }
        }
        else
        {
            assignEntity(ep, lowerCaseText(ent1node), true);
        }
    }

    //entity 2 (overrides anything taken from an entity 1 array)
    if ( null != esp.entity2column )
    {
        assignEntity(ep, lowerCaseText(current_node.get(esp.entity2column)), false);
    }

    //verb and verb category (with a category, the verb may itself be an entity reference)
    String verb = (null != esp.verbcolumn) ? lowerCaseText(current_node.get(esp.verbcolumn)) : null;
    if ( null != esp.verbcategory )
    {
        ep.setVerb_category(esp.verbcategory);
        if ( null != verb )
        {
            ep.setVerb(verb);
            EntityPojo verbent = findMappedEntityName(ep.getVerb());
            if ( verbent != null )
            {
                ep.setVerb(verbent.getActual_name());
            }
        }
    }
    else if ( null != verb )
    {
        ep.setVerb(verb);
    }

    //location
    String location = (null != esp.locationcolumn) ? lowerCaseText(current_node.get(esp.locationcolumn)) : null;
    if ( null != location )
    {
        EntityPojo geoEnt = findMappedEntityName(location);
        if ( geoEnt != null && geoEnt.getGeotag() != null )
        {
            ep.setGeo_index(createEntityIndex(geoEnt));
            ep.setGeotag(geoEnt.getGeotag().deepCopy()); //location always over-rides geotag location
        }
    }

    //time
    String timeStart = (null != esp.timecolumnstart) ? lowerCaseText(current_node.get(esp.timecolumnstart)) : null;
    if ( null != timeStart )
    {
        ep.setTime_start(standardizeTime(timeStart));
        //add some time parsing to get ranges if possible
        String timeEnd = (null != esp.timecolumnend) ? lowerCaseText(current_node.get(esp.timecolumnend)) : null;
        if ( null != timeEnd )
        {
            // (2 element array so that parseEndDate can adjust both ends of the range)
            String[] times = new String[2];
            times[0] = ep.getTime_start();
            times[1] = timeEnd;
            parseEndDate(times);
            ep.setTime_start(times[0]);
            ep.setTime_end(times[1]);
        }
    }

    //remove geotag if it does not have loc
    if ( ep.getGeotag() != null && ep.getGeotag().lon == null)
    {
        ep.setGeotag(null);
    }
    ep.setAssociation_type(getEventType(ep));
    return ep;
}

/**
 * Returns the lower-cased text of a JSON node, or null if the node is missing or does not
 * hold a string.
 */
private static String lowerCaseText(JsonNode node)
{
    if ( null == node )
    {
        return null;
    }
    String text = node.getTextValue(); // (null for non-textual nodes)
    return (null == text) ? null : text.toLowerCase();
}

/**
 * Fills entity 1 (firstSlot) or entity 2 of an association from an entity reference: a known
 * entity supplies its name, index and (if the association has none yet) its geotag; an unknown
 * reference is stored verbatim; a null reference is ignored.
 */
private void assignEntity(AssociationPojo assoc, String entityRef, boolean firstSlot)
{
    if ( null == entityRef )
    {
        return;
    }
    EntityPojo match = findMappedEntityName(entityRef);
    if ( null == match )
    {
        if ( firstSlot )
            assoc.setEntity1(entityRef);
        else
            assoc.setEntity2(entityRef);
        return;
    }
    if ( firstSlot )
    {
        assoc.setEntity1(match.getActual_name());
        assoc.setEntity1_index(createEntityIndex(match));
    }
    else
    {
        assoc.setEntity2(match.getActual_name());
        assoc.setEntity2_index(createEntityIndex(match));
    }
    //try to set geotag if it already hasn't been
    if ( assoc.getGeotag() == null && match.getGeotag() != null )
    {
        assoc.setGeotag(match.getGeotag().deepCopy());
    }
}
/**
 * Turns a loosely formatted end date into a start/end time range when possible.
 * <p>
 * Takes a 2 element array [timestart, timeend] so that both values can be modified in place.
 * Recognised forms of timeend (case-insensitive):
 * <ul>
 * <li>"2004" - the whole year</li>
 * <li>"in 2004" - the whole year</li>
 * <li>"in may" - that month of the current year</li>
 * <li>"last june" - that month of the previous year</li>
 * <li>"june 2004" - that month of that year</li>
 * </ul>
 * When a form is recognised both elements are replaced by "yyyy-MM-dd'T'HH:mm:ss" strings
 * covering 00:00:00 on the first day to 23:59:59 on the last day. Otherwise timestart is
 * left untouched and timeend is set to null.
 *
 * @param times 2 String array consisting of index 0 = timestart and index 1 = timeend
 */
private void parseEndDate(String[] times)
{
    String[] span;
    try
    {
        span = resolveDateSpan(times[1]);
    }
    catch (Exception ex)
    {
        //we had some sort of error, null out the end date, and leave start date whatever open calais extracted
        span = null;
    }
    if ( null != span )
    {
        times[0] = span[0];
        times[1] = span[1];
    }
    else
    {
        times[1] = null;
    }
}

/**
 * Maps one of the recognised end date forms to a [start, end] pair of formatted dates,
 * or returns null if the text matches none of them.
 */
private String[] resolveDateSpan(String timeEnd)
{
    if ( null == timeEnd )
    {
        return null;
    }
    String text = timeEnd.toLowerCase();

    //CASE 1: 2004 (just a year)
    if ( text.length() == 4 )
    {
        int year;
        try
        {
            year = Integer.parseInt(text);
        }
        catch (NumberFormatException ex)
        {
            year = 0;
        }
        if ( year != 0 )
        {
            return formatDateSpan(year, 0, 11);
        }
    }

    //CASE 2: in 2004 (in year) OR in May (in month)
    if ( text.length() >= 3 && text.startsWith("in") )
    {
        String rest = text.substring(3);
        if ( rest.length() >= 4 )
        {
            try
            {
                return formatDateSpan(Integer.parseInt(rest.substring(0, 4)), 0, 11);
            }
            catch (NumberFormatException ex)
            {
                //was not a year, try a month
            }
        }
        int month = parseMonth(rest);
        if ( month < 0 )
        {
            return null;
        }
        // No year was given: use the current one
        return formatDateSpan(Calendar.getInstance().get(Calendar.YEAR), month, month);
    }

    //CASE 3: last june
    if ( text.length() >= 5 && text.startsWith("last") )
    {
        int month = parseMonth(text.substring(5));
        if ( month < 0 )
        {
            return null;
        }
        return formatDateSpan(Calendar.getInstance().get(Calendar.YEAR) - 1, month, month);
    }

    //CASE 4: June 2004 (month and year)
    String[] parts = text.split(" ");
    if ( parts.length == 2 )
    {
        int month = parseMonth(parts[0]);
        if ( month > -1 )
        {
            try
            {
                return formatDateSpan(Integer.parseInt(parts[1]), month, month);
            }
            catch (NumberFormatException ex)
            {
                return null;
            }
        }
    }

    //didn't fall into one of our cases, we either dont need to parse or dont know how
    return null;
}

/**
 * Formats the range from 00:00:00 on the first day of firstMonth to 23:59:59 on the last day
 * of lastMonth (months are 0-11) of the given year.
 */
private static String[] formatDateSpan(int year, int firstMonth, int lastMonth)
{
    // HH = 24 hour clock, matching the format produced by standardizeTime
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
    Calendar cal = Calendar.getInstance();
    cal.clear();
    cal.set(year, firstMonth, 1, 0, 0, 0);
    Date datestart = cal.getTime();
    cal.clear();
    cal.set(year, lastMonth, 1, 23, 59, 59);
    cal.set(Calendar.DATE, cal.getActualMaximum(Calendar.DATE));
    Date dateend = cal.getTime();
    return new String[] { sdf.format(datestart), sdf.format(dateend) };
}
/**
 * Converts the full English name of a month (any case) to its zero-based number.
 *
 * @param month full name of the month, e.g. January, may, JULY
 * @return 0 (january) to 11 (december), or -1 if the name is not recognised
 */
private int parseMonth(String month)
{
    final String[] monthNames = {
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december"
    };
    String wanted = month.toLowerCase();
    for ( int index = 0; index < monthNames.length; index++ )
    {
        if ( monthNames[index].equals(wanted) )
        {
            return index;
        }
    }
    return -1;
}
/**
 * Normalises a date extracted by OpenCalais to the form yyyy-MM-dd'T'HH:mm:ss.
 *
 * The input is tried as yyyy-MM-dd first and as a bare yyyy second. If neither
 * can be parsed, the original text is returned wrapped in parentheses.
 *
 * @param date the date text to normalise
 * @return the normalised date, or "(" + date + ")" if it could not be parsed
 */
private String standardizeTime(String date)
{
    final String[] acceptedPatterns = { "yyyy-MM-dd", "yyyy" };
    for ( String pattern : acceptedPatterns )
    {
        try
        {
            Date parsed = new SimpleDateFormat(pattern).parse(date);
            return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(parsed);
        }
        catch (Exception ex)
        {
            //not in this format, try the next one
        }
    }
    //just return what they gave us if all fails
    return "(" + date + ")";
}
/**
 * Classifies an association.
 *
 * An association with fewer than 2 of (entity1 index, entity2 index, geo index) set is a
 * "Summary". Otherwise its type is whatever is registered for its verb category in the
 * fact/event map, defaulting to "Event".
 *
 * @param evt the association to classify
 * @return "Summary", "Event", or the value registered for the verb category
 */
private String getEventType(AssociationPojo evt)
{
    //count disambig ents
    Object[] disambiguated = { evt.getEntity1_index(), evt.getEntity2_index(), evt.getGeo_index() };
    int found = 0;
    for ( Object index : disambiguated )
    {
        if ( null != index )
        {
            found++;
        }
    }
    if ( found <= 1 )
    {
        return "Summary";
    }
    String registered = factOrEvent.get(evt.getVerb_category());
    return (null == registered) ? "Event" : registered; // (defaults to event)
}
/**
 * Builds the gazetteer index for an entity. This is done here because the
 * entity has not been added to the gazetteer yet and so has no index of its own.
 *
 * @param ent the entity to index
 * @return "name/type" (both lower-cased) built from the disambiguated name and the type,
 *         or the disambiguated name unchanged when the entity has no type
 */
private String createEntityIndex(EntityPojo ent)
{
    String type = ent.getType();
    if ( null == type )
    {
        return ent.getDisambiguatedName();
    }
    return ent.getDisambiguatedName().toLowerCase() + '/' + type.toLowerCase();
}
/**
 * Reads the event schemas from "event_schema.xml" in the configuration directory.
 * <p>
 * Each "event" element yields one schema, populated from its eventtype, entity1column,
 * entity2column, verbcolumn, verbcategory, locationcolumn, timecolumnstart and timecolumnend
 * children. A "metatype" child is recorded in the fact/event map under the schema's verb
 * category, wherever it appears relative to the verbcategory child. Empty child elements
 * are treated as absent, and events without an eventtype are skipped.
 * <p>
 * If the file cannot be read or parsed the error is logged and the schemas loaded so far
 * (possibly none) are returned.
 *
 * @return the schemas keyed by event type
 */
private Map<String,EventSchemaPojo> loadEventSchemas()
{
    Map<String, EventSchemaPojo> schemas = new HashMap<String,EventSchemaPojo>();
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    try
    {
        File file = new File(Globals.getConfigLocation()+"/event_schema.xml");
        DocumentBuilder db = dbf.newDocumentBuilder();
        Document doc = db.parse(file);
        doc.getDocumentElement().normalize();
        NodeList nodelist = doc.getElementsByTagName("event");
        for ( int i = 0; i < nodelist.getLength(); i++ )
        {
            EventSchemaPojo esp = new EventSchemaPojo();
            String metatype = null;
            NodeList children = nodelist.item(i).getChildNodes();
            for ( int j = 0; j < children.getLength(); j++)
            {
                Node child = children.item(j);
                String name = child.getNodeName();
                if ( "eventtype".equals(name) )
                    esp.eventtype = firstChildValue(child);
                else if ( "entity1column".equals(name) )
                    esp.entity1column = firstChildValue(child);
                else if ( "verbcolumn".equals(name) )
                    esp.verbcolumn = firstChildValue(child);
                else if ( "verbcategory".equals(name) )
                    esp.verbcategory = firstChildValue(child);
                else if ( "entity2column".equals(name) )
                    esp.entity2column = firstChildValue(child);
                else if ( "locationcolumn".equals(name) )
                    esp.locationcolumn = firstChildValue(child);
                else if ( "timecolumnstart".equals(name) )
                    esp.timecolumnstart = firstChildValue(child);
                else if ( "timecolumnend".equals(name) )
                    esp.timecolumnend = firstChildValue(child);
                else if ( "metatype".equals(name) )
                    metatype = firstChildValue(child);
            }
            // Registered only once the whole event has been read, so the verb category is known
            if ( null != metatype )
            {
                factOrEvent.put(esp.verbcategory, metatype);
            }
            if ( null == esp.eventtype )
            {
                logger.warn("Ignoring event schema #" + i + " without an eventtype");
                continue;
            }
            schemas.put(esp.eventtype, esp);
        }
    }
    catch (Exception ex)
    {
        logger.error("Unable to load OpenCalais event schemas: " + ex.getMessage(), ex);
    }
    return schemas;
}

/**
 * Returns the value of an element's first child node (its text for a simple element),
 * or null if the element has no children.
 */
private static String firstChildValue(Node element)
{
    Node first = element.getFirstChild();
    return (null == first) ? null : first.getNodeValue();
}
class ShutdownHook extends Thread
{
public void run()
{
if ((null != num_extraction_requests) && (null != num_extraction_collisions)) {
if ((num_extraction_requests.get() > 0) || (num_extraction_collisions.get() > 0)){
StringBuilder sb = new StringBuilder();
sb.append("OpenCalais runtime report: ");
sb.append("num_of_extraction_requests=" + num_extraction_requests.get());
sb.append(" num_of_extraction_collisions=" + num_extraction_collisions.get());
logger.info(sb.toString());
}
}
// (did see a null ptr exception here, not clear how it happens - ie ^^^ for robustness)
}
}
}