Package org.mediameter.cliff.util

Source Code of org.mediameter.cliff.util.MuckUtils

package org.mediameter.cliff.util;

import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.mediameter.cliff.extractor.ExtractedEntities;
import org.mediameter.cliff.extractor.OrganizationOccurrence;
import org.mediameter.cliff.extractor.PersonOccurrence;
import org.mediameter.cliff.extractor.SentenceLocationOccurrence;

import com.google.gson.Gson;

@SuppressWarnings({ "rawtypes", "unchecked" })
public class MuckUtils {

    public static ExtractedEntities entitiesFromJsonString(String nlpJsonString){
        Map sentences = sentencesFromJsonString(nlpJsonString);
        return entitiesFromSentenceMap(sentences);
    }

    public static Map sentencesFromJsonString(String nlpJsonString) {
        Gson gson = new Gson();
        Map content = gson.fromJson(nlpJsonString, Map.class);
        return content;
    }
       
    /**
     * I've overloaded "position" in each of the occurrences to be sentenceIndex
     */
    private static ExtractedEntities entitiesFromSentenceMap(Map mcSentences){
        ExtractedEntities entities = new ExtractedEntities();
        Iterator it = mcSentences.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry pairs = (Map.Entry)it.next();
            String storySentencesId = pairs.getKey().toString();
            Map corenlp = (Map) pairs.getValue();
            List<Map> nlpSentences = (List<Map>) ((Map) corenlp.get("corenlp")).get("sentences");
            for(Map sentence:nlpSentences){ // one mc sentence could be multiple corenlp sentences
                String queuedEntityText = null;
                String lastEntityType = null;
                List<Map> tokens = (List<Map>) sentence.get("tokens");
                for (Map token : tokens){
                    String entityType = (String) token.get("ne");
                    String tokenText = (String) token.get("word");
                    if(entityType.equals(lastEntityType)){
                        queuedEntityText+= " "+tokenText;
                    } else {
                        if(queuedEntityText!=null && lastEntityType!=null){
                            //TODO: figure out if we need the character index here or not
                            switch(lastEntityType){
                            case "PERSON":
                                entities.addPerson(new PersonOccurrence(queuedEntityText, 0));
                                break;
                            case "LOCATION":
                                entities.addLocation(new SentenceLocationOccurrence(queuedEntityText, storySentencesId));
                                break;
                            case "ORGANIZATION":
                                entities.addOrganization(new OrganizationOccurrence(queuedEntityText, 0));
                                break;
                            }
                        }
                        queuedEntityText = tokenText;
                    }
                    lastEntityType = entityType;
                }
            }
            it.remove(); // avoids a ConcurrentModificationException
        }
        return entities;
    }
   
}
TOP

Related Classes of org.mediameter.cliff.util.MuckUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.