Package com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais

Source Code of com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais$ShutdownHook

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.Globals.Identity;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.utils.DimensionUtility;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

public class ExtractorOpenCalais implements IEntityExtractor
  @Override
  public String getName() { return "opencalais"; }
   
  private Map<EntityExtractorEnum, String> _capabilities = new HashMap<EntityExtractorEnum, String>();
  private static final String CALAIS_URL = "http://api.opencalais.com/tag/rs/enrich";
  private String CALAIS_LICENSE = null;
    private HttpClient client;
    private Map<String,EntityPojo> entityNameMap = new HashMap<String, EntityPojo>();
    private Map<String, EventSchemaPojo> eventSchemas;
    private Map<String, String> factOrEvent = new HashMap<String, String>();
   
   
    private static final Logger logger = Logger.getLogger(ExtractorOpenCalais.class);
    private static AtomicLong numInstances = new AtomicLong(0);
    private static ShutdownHook shutdownHook = null;
    private static AtomicLong num_extraction_collisions = new AtomicLong(0);
    private static AtomicLong num_extraction_requests = new AtomicLong(0);

  private static final int MAX_LENGTH = 99000
   
    private boolean bAddRawEventsToMetadata = false;
   
  //_______________________________________________________________________
  //_____________________________INITIALIZATION________________
  //_______________________________________________________________________

  public ExtractorOpenCalais()
  {   
    PropertiesManager props = new PropertiesManager();
    CALAIS_LICENSE = props.getExtractorKey("OpenCalais");
   
    client = new HttpClient();
    eventSchemas = loadEventSchemas();
    //insert capabilities of this extractor
    _capabilities.put(EntityExtractorEnum.Name, "OpenCalais");
    _capabilities.put(EntityExtractorEnum.Quality, "1");
    _capabilities.put(EntityExtractorEnum.GeotagExtraction, "true");
    _capabilities.put(EntityExtractorEnum.MaxInputBytes, Integer.toString(MAX_LENGTH));
   
    if (Identity.IDENTITY_SERVICE == Globals.getIdentity()) { // (ie not for API)
      if ( 1 == numInstances.incrementAndGet() ) // (first time only...)
      {
        shutdownHook = new ShutdownHook();
        Runtime.getRuntime().addShutdownHook(shutdownHook);
      }
    }
  }
  // Configuration: override global configuration on a per source basis
 
  private boolean configured = false;
 
  private void configure(SourcePojo source)
  {
    if (configured) {
      return;
    }
    configured = true;
   
    // SOURCE OVERRIDE
   
    Boolean bWriteMetadata = null;
    String apiKey = null;
   
    if ((null != source) && (null != source.getExtractorOptions())) {
      try {
        String s = source.getExtractorOptions().get("app.opencalais.store_raw_events");
        if (null != s) bWriteMetadata = Boolean.parseBoolean(s);
      }
      catch (Exception e){}
      try {
        apiKey = source.getExtractorOptions().get("app.opencalais.apiKeyOverride");
      }
      catch (Exception e){}
    }   
   
    // DEFAULT CONFIGURATION
   
    PropertiesManager properties = new PropertiesManager();
   
    try {
      if (null == bWriteMetadata) { // (ie not per source)
        bWriteMetadata = properties.getExtractionCapabilityEnabled(getName(), "store_raw_events");     
      }
    }
    catch (Exception e) {}

    // ACTUALLY DO CONFIGURATION
   
    if (null != bWriteMetadata) {
      bAddRawEventsToMetadata = bWriteMetadata;
    }
    if (null != apiKey) {
      this.CALAIS_LICENSE = apiKey;
    }
  } 
  //_______________________________________________________________________
  //_____________________________ENTITY EXTRACTOR FUNCTIONS________________
  //_______________________________________________________________________
 
  /**
   * Takes a feed with some of the information stored in it
   * such as title, desc, etc, and needs to parse the full
   * text and add entities, events, and other metadata.
   *
   * @param partialDoc The feedpojo before extraction with fulltext field to extract on
   * @return The feedpojo after extraction with entities, events, and full metadata
   * @throws ExtractorDocumentLevelException
   */
  @Override
  public void extractEntities(DocumentPojo partialDoc) throws ExtractorDocumentLevelException
  {
    if (null == partialDoc) {
      return;
    }
    configure(partialDoc.getTempSource());
   
    num_extraction_requests.incrementAndGet();
    try
    {
      if (null == partialDoc.getFullText()) {
        return;
      }
      if (partialDoc.getFullText().length() < 32) { // Else don't waste Extractor call/error logging
        return;
     
     
      PostMethod method = createPostMethod(partialDoc.getFullText());
      int responseCode = client.executeMethod(method);
     
      if ( responseCode == HttpStatus.SC_FORBIDDEN) //INF-1101 forbidden gets thrown when too many concurrent requests occur, try 14 more times
      {
        int count = 1;
        while ( count < 15 && responseCode == HttpStatus.SC_FORBIDDEN )
        {
          try {
            Thread.sleep(1800);
          }
          catch (Exception e) {} // carry on...

          responseCode = client.executeMethod(method); //attempt call again
          count++;
        }
        num_extraction_collisions.addAndGet(count);
      }
     
      if ( responseCode == HttpStatus.SC_OK)
      {
        byte[] responseBytes = method.getResponseBody();
        String response = new String(responseBytes, "UTF-8");
        List<EntityPojo> entities = new ArrayList<EntityPojo>();       
        List<AssociationPojo> events = new ArrayList<AssociationPojo>();
        ObjectMapper mapper = new ObjectMapper();
        JsonNode root = mapper.readValue(response, JsonNode.class);
        Iterator<JsonNode> iter = root.getElements();
        Iterator<String> iterNames = root.getFieldNames();
        List<JsonNode> eventNodes = new ArrayList<JsonNode>();
        BasicDBList rawEventObjects = null;
        while ( iter.hasNext() )
        {
          String currNodeName = iterNames.next();
          JsonNode currNode = iter.next();
          if (!currNodeName.equals("doc")) //we can assume these are the entities/topics
          {
            String typeGroup = currNode.get("_typeGroup").getTextValue();
            //check typegroup to see if it is an entity
            if ( typeGroup.equals("entities") )
            {
              try
              {
                EntityPojo ep = new EntityPojo();
                //get what fields we can         
                ep.setType(currNode.get("_type").getTextValue());
                try {
                  ep.setDimension(DimensionUtility.getDimensionByType(ep.getType()));
                }
                catch (java.lang.IllegalArgumentException e) {
                  ep.setDimension(EntityPojo.Dimension.What);                 
                }
                String name = "";
                JsonNode nameNode = null;
                try
                {
                  nameNode = currNode.get("name");
                  name = nameNode.getTextValue();
                }
                catch (Exception ex )
                {                 
                  logger.debug("Error parsing name node: " + currNode.toString());
                  continue;
                }
                ep.setActual_name(name);
                ep.setRelevance(Double.parseDouble(currNode.get("relevance").getValueAsText()));
                ep.setFrequency((long)currNode.get("instances").size());
                //attempt to get resolutions if they exist
                JsonNode resolutionNode = currNode.get("resolutions");
                if ( null != resolutionNode )
                {
                  //resolution nodes are arrays
                  JsonNode resolutionFirst = resolutionNode.get(0);
                  ep.setSemanticLinks(new ArrayList<String>());
                  ep.getSemanticLinks().add(resolutionFirst.get("id").getTextValue()); //this is a link to an alchemy page
                  ep.setDisambiguatedName(resolutionFirst.get("name").getTextValue());
                  //check if we need to create a geo object
                  if ( null != resolutionFirst.get("latitude") )
                  {
                    GeoPojo gp = new GeoPojo();
                    String lat = resolutionFirst.get("latitude").getValueAsText();
                    String lon = resolutionFirst.get("longitude").getValueAsText();
                    gp.lat = Double.parseDouble(lat);
                    gp.lon = Double.parseDouble(lon);
                    ep.setGeotag(gp);
                 
                }
                else {
                  ep.setDisambiguatedName(name); // use actual name)                 
                }
                entityNameMap.put(currNodeName.toLowerCase(), ep);
                entities.add(ep);
              }
              catch (Exception ex)
              {
                logger.error("Error creating event pojo from OpenCalaisNode: " + ex.getMessage(), ex);
              }
            }
            else if ( typeGroup.equals("relations") )
            {             
              eventNodes.add(currNode);           
            }
          }         
        }
        //handle events
        if (bAddRawEventsToMetadata) {
          // For now just re-process these into DB objects since we know that works...
          rawEventObjects = new BasicDBList();
        }
        for ( JsonNode eventNode : eventNodes )
        {         
          AssociationPojo event = parseEvent(eventNode);
          //remove useless events (an event is useless if it only has a verb (guessing currently)
          if ( null != event )
          {
            event = removeUselessEvents(event);
            if ( null != event )
            {
              events.add(event);
            }
          }
          if (bAddRawEventsToMetadata) {
            BasicDBObject eventDbo = (BasicDBObject) com.mongodb.util.JSON.parse(eventNode.toString());
            if (null != eventDbo) {
              BasicDBObject transformObj = new BasicDBObject();
              for (Map.Entry<String, Object> entries: eventDbo.entrySet()) {
                if (entries.getValue() instanceof String) {
                  String val = (String) entries.getValue();
                  EntityPojo transformVal = findMappedEntityName(val);
                  if (null != transformVal) {
                    transformObj.put(entries.getKey(), transformVal.getIndex());                   
                    transformObj.put(entries.getKey() + "__hash", val);                   
                  }
                  else {
                    transformObj.put(entries.getKey(), val);                   
                  }
                }
                else {
                  transformObj.put(entries.getKey(), entries.getValue());
                }
              }
             
              // (add to another list, which will get written to metadata)
              rawEventObjects.add(transformObj);
            }
          }
        }
        if (bAddRawEventsToMetadata) {
          partialDoc.addToMetadata("OpenCalaisEvents", rawEventObjects.toArray());
        }
        if (null != partialDoc.getEntities()) {
          partialDoc.getEntities().addAll(entities);
          partialDoc.setEntities(partialDoc.getEntities());
        }
        else if (null != entities) {
          partialDoc.setEntities(entities);
        }
        if (null != partialDoc.getAssociations()) {
          partialDoc.getAssociations().addAll(events);
          partialDoc.setAssociations(partialDoc.getAssociations());
        }
        else if (null != events) {
          partialDoc.setAssociations(events);
        }
      }
      else // Error back from OC, presumably the input doc is malformed/too long
      {
        throw new InfiniteEnums.ExtractorDocumentLevelException("OpenCalais HTTP error code: " + Integer.toString(responseCode));
      }
    }
    catch (Exception e)
    {   
      //DEBUG
      //e.printStackTrace();
      logger.debug("OpenCalais", e);
      //there was an error, so we return null instead
      throw new InfiniteEnums.ExtractorDocumentLevelException(e.getMessage());
    }
  }

  /**
   * Removes useless events by returning null so they
   * do not get saved
   *
   * Current strategy, if only a verb exists, remove this event
   *
   * @param event The eventpojo to check if its useless
   * @return Null if event is useless, otherwise the event
   */
  private AssociationPojo removeUselessEvents(AssociationPojo event)
  {
    if (   event.getVerb() != null &&
        event.getEntity1() == null &&
        event.getEntity2() == null &&
        event.getTime_start() == null &&
        event.getGeo_index() == null )
      return null;
    return event;
  }

  @Override
  public void extractEntitiesAndText(DocumentPojo partialDoc)
      throws ExtractorDailyLimitExceededException,
      ExtractorDocumentLevelException
  {
    throw new RuntimeException("You must have a textEngine or text object in front of this featureEngine.");
  }

  /**
   * Attempts to lookup if this extractor has a given capability,
   * if it does returns value, otherwise null
   *
   * @param capability Extractor capability we are looking for
   * @return Value of capability, or null if capability not found
   */
  @Override
  public String getCapability(EntityExtractorEnum capability)
  {
    return _capabilities.get(capability);
  }
 
  //_______________________________________________________________________
  //_____________________________UTILITY FUNCTIONS_________________________
  //_______________________________________________________________________
 
  private PostMethod createPostMethod(String text) throws UnsupportedEncodingException {

    if (text.length() > MAX_LENGTH) {
      text = text.substring(0, MAX_LENGTH);
    }   
        PostMethod method = new PostMethod(CALAIS_URL);

        // Set mandatory parameters
        method.setRequestHeader("x-calais-licenseID", CALAIS_LICENSE.trim());
       
        // Set input content type
        method.setRequestHeader("Content-Type", "text/raw; charset=UTF-8");

    // Set response/output format
        method.setRequestHeader("Accept", "application/json");

        method.setRequestHeader("enableMetadataType","GenericRelations");
        // Enable Social Tags processing
        method.setRequestEntity(new StringRequestEntity(text,"text/plain","UTF-8"));
        return method;
    }
 
  /**
   * Checks if the entity is in our map and returns
   * its value if so, otherwise just returns this entity.
   *
   * This is used for when OpenCalais references an entity in the form of
   * http://s.opencalais.com/hash so we can get back an actual name like Obama
   *
   * @param entity The entity that could potentially be a hash
   * @return The unhashed entity, just a string name
   */
  private EntityPojo findMappedEntityName(String entity)
  {
    if ( entityNameMap.containsKey(entity) )
      return entityNameMap.get(entity);
    else
    {
      //Here we create a fake pojo to return so it will just use
      //the text given (could return null and do a check but
      //requires a lot of extra code
      /*EntityPojo fakeEP = new EntityPojo();
      fakeEP.disambiguous_name = entity;
      fakeEP.actual_name = entity;
      return fakeEP;*/
      return null;
    }
  }
 
  /**
   * Parses the entity type into the correct noun verb noun columns
   *
   *
   * @param nodename
   * @param current_node
   * @return
   */
  public AssociationPojo parseEvent(JsonNode current_node)
  {
    AssociationPojo ep = null;
    //handle the different types on entities
    String entity_type = current_node.get("_type").getTextValue().toLowerCase();
    String curr_ent;
    //find eventschema for this type if one exists
    EventSchemaPojo esp = eventSchemas.get(entity_type);
    if ( esp != null )
    {
      ep = new AssociationPojo();
      //entity 1
      if ( null != esp.entity1column && null != current_node.get(esp.entity1column) )
      {
        JsonNode ent1node = current_node.get(esp.entity1column);
        if ( ent1node.isArray() )
        {
          Iterator<JsonNode> entiter = ent1node.getElements();
          curr_ent = entiter.next().getTextValue().toLowerCase();
          EntityPojo matchEnt1 = findMappedEntityName(curr_ent);
          if ( null != matchEnt1)
          {
            ep.setEntity1(matchEnt1.getActual_name());
            ep.setEntity1_index(createEntityIndex(matchEnt1));
            if ( ep.getGeotag() == null && matchEnt1.getGeotag() != null) //try to set geotag if it already hasn't been
              ep.setGeotag(matchEnt1.getGeotag().deepCopy());
          }
          else
            ep.setEntity1(curr_ent);         
         
          if ( entiter.hasNext())
          {
            curr_ent = entiter.next().getTextValue().toLowerCase();
            EntityPojo matchEnt12 = findMappedEntityName(curr_ent);
            if ( null != matchEnt12 )
            {
              ep.setEntity2(matchEnt12.getActual_name());
              ep.setEntity2_index(createEntityIndex(matchEnt12));
              if ( ep.getGeotag() == null && matchEnt12.getGeotag() != null) //try to set geotag if it already hasn't been
                ep.setGeotag(matchEnt12.getGeotag().deepCopy());
            }
            else
              ep.setEntity2(curr_ent);           
          }
        }
        else
        {
          curr_ent = current_node.get(esp.entity1column).getTextValue().toLowerCase();
          EntityPojo matchEnt1Only = findMappedEntityName(curr_ent);
          if ( null != matchEnt1Only )
          {
            ep.setEntity1(matchEnt1Only.getActual_name());
            ep.setEntity1_index(createEntityIndex(matchEnt1Only));
            if ( ep.getGeotag() == null && matchEnt1Only.getGeotag() != null ) //try to set geotag if it already hasn't been
              ep.setGeotag(matchEnt1Only.getGeotag().deepCopy());
          }
          else
            ep.setEntity1(curr_ent);         
        }
      }     
      //entity 2     
      if ( null != esp.entity2column && null != current_node.get(esp.entity2column)  )
      {
        JsonNode ent2node = current_node.get(esp.entity2column);
        if ( ent2node.isTextual() )
        {
          curr_ent = current_node.get(esp.entity2column).getTextValue().toLowerCase();
          EntityPojo matchEnt2 = findMappedEntityName(curr_ent);
          if ( null != matchEnt2 )
          {
            ep.setEntity2(matchEnt2.getActual_name());
            ep.setEntity2_index(createEntityIndex(matchEnt2));
            if ( ep.getGeotag() == null && matchEnt2.getGeotag() != null ) //try to set geotag if it already hasn't been
              ep.setGeotag(matchEnt2.getGeotag().deepCopy());
          }
          else
            ep.setEntity2(curr_ent);
        }
      }
      //verb and verb category (if there is a verb cat, assign that and then get column value)
      if ( null != esp.verbcategory )
      {
        ep.setVerb_category(esp.verbcategory);
       
        if ( null != esp.verbcolumn && null != current_node.get(esp.verbcolumn) )
        {
          JsonNode verbnode = current_node.get(esp.verbcolumn);
          if ( verbnode.isTextual() )
          {
            ep.setVerb(current_node.get(esp.verbcolumn).getTextValue().toLowerCase());
            EntityPojo verbent = findMappedEntityName(ep.getVerb());
            if ( verbent != null )
              ep.setVerb(verbent.getActual_name());
          }
        }
      }
      else if ( null != esp.verbcolumn && null != current_node.get(esp.verbcolumn) )
      {
        ep.setVerb(current_node.get(esp.verbcolumn).getTextValue().toLowerCase());
      }
      //location
      if ( null != esp.locationcolumn && null != current_node.get(esp.locationcolumn) )
      {
        curr_ent = current_node.get(esp.locationcolumn).getTextValue().toLowerCase();
        EntityPojo geoEnt = findMappedEntityName(curr_ent);
        if ( geoEnt != null && geoEnt.getGeotag() != null )
        {
          ep.setGeo_index(createEntityIndex(geoEnt));       
          ep.setGeotag(geoEnt.getGeotag().deepCopy()); //location always over-rides geotag location
        }
      }     
      //time
      if ( null != esp.timecolumnstart && null != current_node.get(esp.timecolumnstart) )
      {
        curr_ent = current_node.get(esp.timecolumnstart).getTextValue().toLowerCase();       
        if ( null != curr_ent )
        {
          ep.setTime_start(standardizeTime(curr_ent));
          //System.out.println(current_node);
          //add some time parsing to get ranges if possible 
          if ( null != esp.timecolumnend && null != current_node.get(esp.timecolumnend) )
          {
            curr_ent = current_node.get(esp.timecolumnend).getTextValue().toLowerCase();
            String[] times = new String[2];
            times[0] = ep.getTime_start();
            times[1] = curr_ent;
            parseEndDate(times);
            ep.setTime_start(times[0]);
            ep.setTime_end(times[1]);
          }
        }
      }
      //remove geotag if it does not have loc
      if ( ep.getGeotag() != null && ep.getGeotag().lon == null)
        ep.setGeotag(null);
      ep.setAssociation_type(getEventType(ep));
    }
    else
    {
      // It's OK just to use the log for this, at some point could consider passing in HarvestContext
      // so could use the per source logger
      logger.info("OpenCalais extractor does not have an event_schema for: " + entity_type);
    }
    return ep;
  }
 
  /**
   * Modifies both the time start and time end variables to create time ranges
   * when possible.
   *
   * Takes a 2 String array [ timestart, timeend] so that it can be passed by
   * refence and therefore both items  can be modified
   *
   * @param times 2 String array consisting of index 0 = timestart and index 1 = timeend
   */
  private void parseEndDate(String[] times)
  {
    String time_start = times[0];
    String time_end = times[1];
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss");
    int num_time_end = 0;
    try
    {
      num_time_end = Integer.parseInt(time_end);
    }
    catch (Exception ex)
    {
      num_time_end = 0;
    }
   
    try
    {
      if ( num_time_end != 0 && time_end.length() == 4 ) //CASE 1: 2004 (just a year)
      {
        //just a year, span from jan 1 to dec 31
        Calendar cal = Calendar.getInstance();
        cal.set(num_time_end, 0, 1);
        Date datestart = cal.getTime();
        cal.set(num_time_end, 11,31);
        Date dateend = cal.getTime();
        time_start = sdf.format(datestart);
        time_end = sdf.format(dateend);
      }   
      else if ( time_end.substring(0,2).toLowerCase().equals("in") ) //CASE 2: in 2004 (in year) OR in May (in month)
      {
        try
        {
          //pull out year and span from jan1 to dec 31
          num_time_end = Integer.parseInt(time_end.substring(3,7));
          Calendar cal = Calendar.getInstance();
          cal.set(num_time_end, 0, 1);
          Date datestart = cal.getTime();
          cal.set(num_time_end, 11,31);
          Date dateend = cal.getTime();
          time_start = sdf.format(datestart);
          time_end = sdf.format(dateend);
        }
        catch(Exception ex)
        {
          //was not a year, try a month
          String monthString = time_end.substring(3);
          int monthint = parseMonth(monthString);
          if ( monthint > -1 )
          {
            Calendar cal = Calendar.getInstance();
            cal.set(num_time_end, monthint, 1);
            Date datestart = cal.getTime();
            cal.set(num_time_end, monthint,cal.getActualMaximum(Calendar.DATE));
            Date dateend = cal.getTime();
            time_start = sdf.format(datestart);
            time_end = sdf.format(dateend);
          }
          else
          {
            time_end = null;
          }         
        }
      }
      else if ( time_end.substring(0,4).toLowerCase().equals("last") ) //CASE 3: last june
      {
        String monthString = time_end.substring(5);
        int monthint = parseMonth(monthString);
        if ( monthint > -1 )
        {
          Calendar cal = Calendar.getInstance();
          num_time_end = cal.get(Calendar.YEAR)-1;
          cal.set(num_time_end, monthint, 1);
          Date datestart = cal.getTime();
          cal.set(num_time_end, monthint,cal.getActualMaximum(Calendar.DATE));
          Date dateend = cal.getTime();
          time_start = sdf.format(datestart);
          time_end = sdf.format(dateend);
        }
        else
        {
          time_end = null;
        }   
      }
      else if ( time_end.split(" ").length == 2 ) //CASE 4: June 2004 (month and year)
      {
        String[] parts = time_end.split(" ");
        //try to get month
        int monthint = parseMonth(parts[0]);
        if ( monthint > -1 )
        {
          try
          {
            num_time_end = Integer.parseInt(parts[1]);
            Calendar cal = Calendar.getInstance();
            cal.set(num_time_end, monthint, 1);
            Date datestart = cal.getTime();
            cal.set(num_time_end, monthint,cal.getActualMaximum(Calendar.DATE));
            Date dateend = cal.getTime();
            time_start = sdf.format(datestart);
            time_end = sdf.format(dateend);
          }
          catch (Exception ex)
          {
            num_time_end = 0;
          }
        }
        else
        {
          time_end = null;
        }
      }     
      else //didn't fall into one of our cases, we either dont need to parse or dont know how so null out
      {
        time_end = null;
      }
    }
    catch (Exception ex)
    {
      //we had some sort of error, null out the end date, and leave start date whatever open calais extracted
      time_end = null;
    }
   
   
    //System.out.println(time_start + " to " + time_end);
    times[0] = time_start;
    times[1] = time_end;
  }
 
  /**
   * Returns an integer for the month given from 0(january) to 11(december)
   * returns -1 if no match is found
   *
   * @param month string full name of month, e.g. January,may,JULY
   * @return 0-11 for jan-dec or -1 on error
   */
  private int parseMonth(String month)
  {
    month = month.toLowerCase();
    if ( month.equals("january") )
      return 0;
    else if ( month.equals("february"))
      return 1;
    else if ( month.equals("march"))
      return 2;
    else if ( month.equals("april"))
      return 3;
    else if ( month.equals("may"))
      return 4;
    else if ( month.equals("june"))
      return 5;
    else if ( month.equals("july"))
      return 6;
    else if ( month.equals("august"))
      return 7;
    else if ( month.equals("september"))
      return 8;
    else if ( month.equals("october"))
      return 9;
    else if ( month.equals("november"))
      return 10;
    else if ( month.equals("december"))
      return 11;
    else return -1;
  }
 
  /**
   * OpenCalais dates are in the format (yyyy-mm-dd)
   * convert to yyyy-mm-dd?
   *
   * @param date
   * @return
   */
  private String standardizeTime(String date)
  {
    //attempt 1 try to convert yyyy-mm-dd
    try
    {
      SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
      Date parsedDate = sdf.parse(date);
      SimpleDateFormat sdfEnd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
      return sdfEnd.format(parsedDate);
     
    }
    catch (Exception ex)
    {
      //error converting opencalais date
      //logger.info("Could not extract correct dateformat for: " + date);     
    }
    //attempt 2 try to convert yyyy
    try
    {
      SimpleDateFormat sdf = new SimpleDateFormat("yyyy");
      Date parsedDate = sdf.parse(date);
      SimpleDateFormat sdfEnd = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
      return sdfEnd.format(parsedDate);
     
    }
    catch (Exception ex)
    {
      //error converting opencalais date
      //logger.info("Could not extract correct dateformat for: " + date);
     
    }
    return new StringBuffer("(").append(date).append(")").toString(); //just return what they gave us if all fails
  }
 
  /**
   * Return the type of event based on following criteria,
   * event can be either Event, Fact, or Summary
   *
   *  Event: Must contain atleast 2 disambigous entities
   *  Fact: Generic Relation
   *  Summary: Anything else
   *
   * @param evt
   * @return
   */
  private String getEventType(AssociationPojo evt)
  {
    //count disambig ents
    int disambig_count = 0;
    if ( evt.getEntity1_index() != null ) disambig_count++;
    if ( evt.getEntity2_index() != null ) disambig_count++;
    if ( evt.getGeo_index() != null ) disambig_count++;
   
    String sEventOrFact = factOrEvent.get(evt.getVerb_category());
    if (null == sEventOrFact) { // (defaults to event)
      sEventOrFact = "Event";
    }
    if ( disambig_count > 1 )
      return sEventOrFact;
    else
      return "Summary";
  }
 
  /**
   * Creates the entity gazateer entry if one exists
   * for the current entity.  We have to do this because
   * the entity has not yet been added to the gaz and therefore will not have
   * one otherwise
   *
   * @param ent
   * @return
   */
  private String createEntityIndex(EntityPojo ent)
  {
    if ( ent.getType() != null )
      return new StringBuffer(ent.getDisambiguatedName().toLowerCase()).append('/').append(ent.getType().toLowerCase()).toString();
    else
      return ent.getDisambiguatedName();
  }
 
  /**
   * Read in xml file and save schema examples
   *
   * @return A list of schemas that we can turn into events from open calais
   */
  private Map<String,EventSchemaPojo> loadEventSchemas()
  {
    Map<String, EventSchemaPojo> schemas = new HashMap<String,EventSchemaPojo>();   
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    try
    {
      File file = new File(Globals.getConfigLocation()+"/event_schema.xml");
      DocumentBuilder db = dbf.newDocumentBuilder();
      Document doc = db.parse(file);
      doc.getDocumentElement().normalize();
      NodeList nodelist = doc.getElementsByTagName("event");
      for ( int i = 0; i < nodelist.getLength(); i++ )
      {
        EventSchemaPojo esp = new EventSchemaPojo();
        Node node = nodelist.item(i);
        NodeList children = node.getChildNodes();
        for ( int j = 0; j < children.getLength(); j++)
        {
          Node child = children.item(j);
          String name = child.getNodeName();
          // (note getNodeValue can be null, so can only be referenced in one of the if blocks below)
         
          if ( name.equals("eventtype"))
            esp.eventtype = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("entity1column"))
            esp.entity1column = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("verbcolumn"))
            esp.verbcolumn = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("verbcategory"))
            esp.verbcategory = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("entity2column"))
            esp.entity2column = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("locationcolumn"))
            esp.locationcolumn = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("timecolumnstart"))
            esp.timecolumnstart = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("timecolumnend"))
            esp.timecolumnend = child.getChildNodes().item(0).getNodeValue();
          else if ( name.equals("metatype")) {
            factOrEvent.put(esp.verbcategory, child.getChildNodes().item(0).getNodeValue());
          }           
        }
        schemas.put(esp.eventtype, esp);
      }
    }
    catch (Exception ex)
    {
      logger.error(ex.getMessage());
      ex.printStackTrace();
    }
    return schemas;
  }
 
  class ShutdownHook extends Thread
  {
      public void run()
      {
        if ((null != num_extraction_requests) && (null != num_extraction_collisions)) {
          if ((num_extraction_requests.get() > 0) || (num_extraction_collisions.get() > 0)){
            StringBuilder sb = new StringBuilder();
            sb.append("OpenCalais runtime report: ");
          sb.append("num_of_extraction_requests=" + num_extraction_requests.get());
          sb.append(" num_of_extraction_collisions=" + num_extraction_collisions.get());
          logger.info(sb.toString());
          }
        }
        // (did see a null ptr exception here, not clear how it happens - ie ^^^ for robustness)
      }
  }
}
TOP

Related Classes of com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais$ShutdownHook

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.