/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.ltr;
import ivory.core.RetrievalEnvironment;
import ivory.core.exception.ConfigurationException;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.builder.MRFBuilder;
import ivory.smrf.model.importance.ConceptImportanceModel;
import ivory.smrf.model.importance.LinearImportanceModel;
import ivory.smrf.model.importance.MetaFeature;
import ivory.smrf.retrieval.BatchQueryRunner;
import java.io.IOException;
import java.rmi.NotBoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Map.Entry;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import edu.umd.cloud9.collection.DocnoMapping;
/**
 * Extracts learning-to-rank training data from a batch query run: for each
 * query-document pair with a relevance judgment, emits a tab-separated row of
 * feature values (one feature per model clique parameter, plus optional
 * meta-feature-weighted variants and the relevance grade) to stdout.
 *
 * @author Don Metzler
 */
public class ExtractFeatures {
	// Value printed for a feature a document has no score for.
	private static final double DEFAULT_FEATURE_VALUE = 0.0;
	// Output column names: query id, document id, and relevance grade.
	private static final String QUERY_FEATURE_NAME = "qid";
	private static final String DOC_FEATURE_NAME = "docid";
	private static final String JUDGMENT_FEATURE_NAME = "grade";

	private BatchQueryRunner runner = null; // executes the batch query configuration
	private RetrievalEnvironment env = null; // retrieval environment from the runner
	private Map<String, String> queries = null; // query id -> (cleaned) query text
	private DocnoMapping docnoMapping = null; // docname <-> docno mapping

	/**
	 * Builds the extractor from a batch-query-runner argument list and cleans
	 * the configured queries: each query is tokenized, terms with no postings
	 * list in the index are dropped, and queries left empty are discarded.
	 *
	 * @param args arguments forwarded to {@link BatchQueryRunner}
	 * @param fs   filesystem the runner reads its configuration from
	 * @throws Exception if the runner or environment cannot be initialized
	 */
	public ExtractFeatures(String [] args, FileSystem fs) throws SAXException, IOException, ParserConfigurationException, NotBoundException, Exception {
		loadQueryRunner(args, fs);
		env = runner.getRetrievalEnvironment();
		queries = runner.getQueries();
		docnoMapping = env.getDocnoMapping();

		// Keep only terms that actually occur in the index; drop queries that
		// end up empty after filtering.
		Map<String, String> finalQueries = new HashMap<String, String>();
		for(Entry<String, String> queryEntry : queries.entrySet()) {
			String queryKey = queryEntry.getKey();
			String queryText = queryEntry.getValue();
			String[] parts = env.tokenize(queryText);
			// StringBuilder avoids O(n^2) String concatenation in the loop.
			StringBuilder finalQuery = new StringBuilder();
			for(String part: parts) {
				if(env.getPostingsList(part) != null) {
					if(finalQuery.length() > 0) {
						finalQuery.append(' ');
					}
					finalQuery.append(part);
				}
			}
			if(finalQuery.length() != 0) {
				finalQueries.put(queryKey, finalQuery.toString());
			}
		}
		queries = finalQueries;
	}

	/**
	 * Instantiates the batch query runner used to obtain queries, models,
	 * and judgments.
	 *
	 * @param args arguments forwarded to {@link BatchQueryRunner}
	 * @param fs   filesystem the runner reads its configuration from
	 * @throws ConfigurationException if the runner configuration is invalid
	 */
	public void loadQueryRunner(String [] args, FileSystem fs) throws ConfigurationException{
		runner = new BatchQueryRunner(args, fs);
	}

	/**
	 * Extracts features for every (query, judged document) pair and writes a
	 * tab-separated table to stdout. The first pass over the queries/models
	 * collects the full set of feature names (so every row has the same
	 * columns); the second pass computes clique potentials per document and
	 * prints one row per judged document.
	 *
	 * @throws Exception if MRF construction or evaluation fails
	 */
	private void extract() throws Exception {
		// models specified in parameter files
		Set<String> modelNames = runner.getModels();

		// feature importance models
		Collection<ConceptImportanceModel> importanceModels = env.getImportanceModels();

		// we only know how to deal with linear importance models, so filter the rest out
		List<LinearImportanceModel> linearImportanceModels = new ArrayList<LinearImportanceModel>();
		for(ConceptImportanceModel model : importanceModels) {
			if(model instanceof LinearImportanceModel) {
				linearImportanceModels.add((LinearImportanceModel)model);
			}
		}

		// First pass: gather the complete, sorted set of feature names so the
		// header and every row share the same column order.
		SortedSet<String> featureNames = new TreeSet<String>();
		for(Entry<String, String> queryEntry : queries.entrySet()) {
			// query text
			String queryText = queryEntry.getValue();
			// compute features for each model
			for(String modelName : modelNames) {
				// build mrf from model node
				Node modelNode = runner.getModel(modelName);
				MRFBuilder builder = MRFBuilder.get(env, modelNode);
				MarkovRandomField mrf = builder.buildMRF(queryText.split("\\s+"));
				// get mrf cliques
				List<Clique> cliques = mrf.getCliques();
				// add parameter name to feature name set
				for(Clique c : cliques) {
					// parameter id
					String paramId = c.getParameter().getName();
					// handle linear importance model weights
					if(!importanceModels.isEmpty()) {
						for(LinearImportanceModel model : linearImportanceModels) {
							List<MetaFeature> metaFeatures = model.getMetaFeatures();
							for(MetaFeature metaFeat : metaFeatures) {
								// feature id = modelName-metaFeatId-paramId
								String featId = modelName + "-" + metaFeat.getName() + "-" + paramId;
								featureNames.add(featId);
							}
						}
					}
					// feature id = modelName-paramId
					String featId = modelName + "-" + paramId;
					featureNames.add(featId);
				}
			}
		}

		// add judgment feature name
		featureNames.add(JUDGMENT_FEATURE_NAME);

		// print feature name header
		System.out.print(QUERY_FEATURE_NAME + "\t" + DOC_FEATURE_NAME);
		for(String featureName : featureNames) {
			System.out.print("\t" + featureName);
		}
		System.out.println();

		// Second pass: extract features query-by-query
		for(Entry<String, String> queryEntry : queries.entrySet()) {
			// feature map (docname -> feature name -> feature value)
			SortedMap<String,SortedMap<String,Double>> featureValues = new TreeMap<String,SortedMap<String,Double>>();

			// query id and text
			String qid = queryEntry.getKey();
			String queryText = queryEntry.getValue();

			// compute features for each model
			for(String modelName : modelNames) {
				// build mrf from model node
				Node modelNode = runner.getModel(modelName);
				MRFBuilder builder = MRFBuilder.get(env, modelNode);
				MarkovRandomField mrf = builder.buildMRF(queryText.split("\\s+"));

				// initialize mrf
				mrf.initialize();

				// get mrf cliques
				List<Clique> cliques = mrf.getCliques();

				// get docnodes associated with mrf
				ArrayList<DocumentNode> docNodes = new ArrayList<DocumentNode>();
				List<GraphNode> nodes = mrf.getNodes();
				for (GraphNode node : nodes) {
					if (node instanceof DocumentNode) {
						docNodes.add((DocumentNode) node);
					}
				}

				// get document set to extract features for; skip queries with
				// no judgments rather than failing the whole run
				Map<String,Double> origJudgments = runner.getJudgmentSet(qid);
				if(origJudgments == null) {
					System.err.println("Warning: no judgments found for qid = " + qid + " -- skipping!");
					continue;
				}

				// convert to docid -> judgment mapping
				SortedMap<Integer,Double> judgments = new TreeMap<Integer,Double>();
				Map<Integer,String> docIdToNameMap = new HashMap<Integer,String>();
				for(Entry<String,Double> judgmentEntry : origJudgments.entrySet()) {
					// document name
					String docName = judgmentEntry.getKey();
					// judgment
					double judgment = judgmentEntry.getValue();
					// doc id
					int docid = docnoMapping.getDocno(docName);
					// update maps
					judgments.put(docid, judgment);
					docIdToNameMap.put(docid, docName);
				}

				for(Entry<Integer,Double> judgmentEntry : judgments.entrySet()) {
					// document id
					int docid = judgmentEntry.getKey();
					// document name
					String docName = docIdToNameMap.get(docid);

					// get (or lazily create) feature map for this docname
					SortedMap<String,Double> docFeatures = featureValues.get(docName);
					if(docFeatures == null) {
						docFeatures = new TreeMap<String,Double>();
						featureValues.put(docName, docFeatures);
					}

					// document judgment
					double judgment = judgmentEntry.getValue();

					// set judgment feature
					docFeatures.put(JUDGMENT_FEATURE_NAME, judgment);

					// point the mrf's document nodes at the current document
					for(DocumentNode node : docNodes) {
						node.setDocno(docid);
					}

					// compute potentials for each clique
					for(Clique c : cliques) {
						// parameter id
						String paramId = c.getParameter().getName();

						// handle linear importance model weights (for everything except query-independent clique types)
						if(!importanceModels.isEmpty() && c.getType() != Clique.Type.Document) {
							for(LinearImportanceModel model : linearImportanceModels) {
								List<MetaFeature> metaFeatures = model.getMetaFeatures();
								for(MetaFeature metaFeat : metaFeatures) {
									// feature id = modelName-metaFeatId-paramId
									String featId = modelName + "-" + metaFeat.getName() + "-" + paramId;
									// score = meta-feature weight * (raw) clique potential
									double score = model.computeFeatureValue(c.getConcept(), metaFeat) * c.getPotential();
									// accumulate feature values across cliques sharing a feature id
									Double curVal = docFeatures.get(featId);
									if(curVal == null) {
										docFeatures.put(featId, score);
									}
									else {
										docFeatures.put(featId, curVal + score);
									}
								}
							}
						}

						// feature id = modelName-paramId
						String featId = modelName + "-" + paramId;
						// score = (raw) clique potential
						double score = c.getPotential();
						// accumulate feature values across cliques sharing a feature id
						Double curVal = docFeatures.get(featId);
						if(curVal == null) {
							docFeatures.put(featId, score);
						}
						else {
							docFeatures.put(featId, curVal + score);
						}
					}
				}
			}

			// print feature values for current query; missing features default
			// to DEFAULT_FEATURE_VALUE so every row has the same columns
			for(Entry<String, SortedMap<String, Double>> featureEntry : featureValues.entrySet()) {
				String docName = featureEntry.getKey();
				System.out.print(qid + "\t" + docName);
				Map<String,Double> docFeatures = featureEntry.getValue();
				for(String featureName : featureNames) {
					Double featVal = docFeatures.get(featureName);
					if(featVal == null) {
						featVal = DEFAULT_FEATURE_VALUE;
					}
					System.out.print("\t" + featVal);
				}
				System.out.println();
			}
		}
	}

	/**
	 * Command-line entry point: runs feature extraction against the local
	 * filesystem and prints the feature table to stdout.
	 *
	 * @param args arguments forwarded to {@link BatchQueryRunner}
	 * @throws Exception if initialization or extraction fails
	 */
	public static void main(String[] args) throws SAXException, ParserConfigurationException, NotBoundException, Exception {
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.getLocal(conf);
		ExtractFeatures extractor = new ExtractFeatures(args, fs);
		extractor.extract();
	}
}