Source Code of org.apache.nutch.indexer.field.CustomFields

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
* Creates custom FieldWritable objects from a text file containing field
* information including field name, value, and an optional boost and field type
* (as needed by FieldWritable objects).
*
* An input text file to CustomFields is tab-separated and would look
* similar to this:
*
* <pre>
* http://www.apache.org\tlang\ten\t5.0\tCONTENT
* http://lucene.apache.org\tlang\tde
* </pre>
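*
* Here the first line creates a lang field with the value en, a 5.0 boost,
* and the CONTENT field type; the second omits the optional boost and type,
* so the type defaults to CONTENT and no boost is passed.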
*
* The only required fields are url, name and value. Custom fields are
* configured through the custom-fields.xml file in the classpath. The config
* file allows you to set defaults for whether a field is indexed, stored, and
* tokenized, boosts on a field, and whether a field can output multiple values
* under the same key.
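*
* As a sketch only (the exact custom-fields.xml is not reproduced here), the
* config is a Java properties XML file whose keys end in .name, .indexed,
* .stored, .tokenized, and .multi, with a value of yes, true, or on enabling
* a flag (see Converter.configure below). An entry for a hypothetical lang
* field might look like:
*
* <pre>
* &lt;?xml version="1.0" encoding="UTF-8"?&gt;
* &lt;!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"&gt;
* &lt;properties&gt;
*   &lt;entry key="lang.name"&gt;lang&lt;/entry&gt;
*   &lt;entry key="lang.indexed"&gt;true&lt;/entry&gt;
*   &lt;entry key="lang.stored"&gt;true&lt;/entry&gt;
*   &lt;entry key="lang.tokenized"&gt;false&lt;/entry&gt;
*   &lt;entry key="lang.multi"&gt;false&lt;/entry&gt;
* &lt;/properties&gt;
* </pre>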
*
* The purpose of the CustomFields job is to allow better integration with
* technologies such as Hadoop Streaming. Streaming jobs can be created in any
* programming language, can output the text file needed by the CustomFields
* job, and those fields can then be included in the index.
*
* The concept of custom fields requires two separate pieces. The indexing piece
* and the query piece. The indexing piece is handled by the CustomFields job.
* The query piece is handled by the query-custom plugin.
*
* <b>Important:</b><br> <i>Currently, because of the way the query plugin
* architecture works, custom field names must be added to the fields parameter
* in the plugin.xml file of the query-custom plugin in order to be queried.</i>
*
* The CustomFields tool accepts one or more directories containing text files
* in the appropriate custom field format. These files are then turned into
* FieldWritable objects to be included in the index.
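*
* The final output is a sequence file of Text (url) keys to FieldWritable
* values, and only contains custom fields for urls that also appear in the
* basic fields data.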
*/
public class CustomFields
  extends Configured
  implements Tool {

  public static final Log LOG = LogFactory.getLog(CustomFields.class);

  /**
   * MapReduce job that converts text values into FieldWritable objects.
   *
   * @param inputs The directories with text files to convert.
   * @param output The converter output directory.
   *
   * @throws IOException If an error occurs while converting.
   */
  private void runConverter(Path[] inputs, Path output)
    throws IOException {

    JobConf converter = new NutchJob(getConf());
    converter.setJobName("CustomFields Converter");
    for (int i = 0; i < inputs.length; i++) {
      FileInputFormat.addInputPath(converter, inputs[i]);
    }
    FileOutputFormat.setOutputPath(converter, output);
    converter.setInputFormat(TextInputFormat.class);
    converter.setMapperClass(Converter.class);
    converter.setReducerClass(Converter.class);
    converter.setMapOutputKeyClass(Text.class);
    converter.setMapOutputValueClass(FieldWritable.class);
    converter.setOutputKeyClass(Text.class);
    converter.setOutputValueClass(FieldWritable.class);
    converter.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting converter job");
    try {
      JobClient.runJob(converter);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished converter job.");
  }

  /**
   * Aggregates multiple FieldWritable objects with the same name.  Depending
   * on settings in the custom-fields.xml file, a field may have one or more
   * values.  This job aggregates the fields and then collects them based on
   * the configuration settings.
   *
   * @param basicFields The path to the basic fields output.
   * @param converted The converted custom field objects.
   * @param output The final output directory for custom field objects.
   *
   * @throws IOException If an error occurs while collecting.
   */
  private void runCollector(Path basicFields, Path converted, Path output)
    throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("CustomFields Collector");
    FileInputFormat.addInputPath(collector, converted);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
      JobClient.runJob(collector);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished collector job.");
  }

  /**
   * Converts text values into FieldWritable objects.
   */
  public static class Converter
    extends Configured
    implements Mapper<LongWritable, Text, Text, FieldWritable>,
    Reducer<Text, FieldWritable, Text, FieldWritable> {

    private JobConf conf;
    // field name to {indexed, stored, tokenized} flags from custom-fields.xml
    private Map<String, boolean[]> flagMap = new HashMap<String, boolean[]>();
    // names of fields allowed to emit multiple values for the same url
    private Set<String> multiFields = new HashSet<String>();

    public Converter() {
    }

    public Converter(Configuration conf) {
      setConf(conf);
    }

    public void configure(JobConf conf) {

      try {

        // get the configuration file from the classpath
        this.conf = conf;
        String configFile = conf.get("custom.fields.config",
          "custom-fields.xml");
        LOG.info("Reading custom field configuration from " + configFile);
        Properties customFieldProps = new Properties();
        InputStream fis = conf.getConfResourceAsInputStream(configFile);
        if (fis == null) {
          throw new IOException("Was unable to open " + configFile);
        }

        // load the configuration file as properties
        customFieldProps.loadFromXML(fis);

        // loop through the properties setting field flags
        Enumeration propKeys = customFieldProps.keys();
        while (propKeys.hasMoreElements()) {
          String prop = (String)propKeys.nextElement();
          if (prop.endsWith(".name")) {
            String propName = prop.substring(0, prop.length() - 5);
            String name = customFieldProps.getProperty(prop);

            String indexedProp = customFieldProps.getProperty(propName
              + ".indexed");
            String storedProp = customFieldProps.getProperty(propName
              + ".stored");
            String tokProp = customFieldProps.getProperty(propName
              + ".tokenized");
            // a value of yes, true, or on (case-insensitive) enables a flag;
            // a missing property defaults to false rather than throwing a
            // NullPointerException
            boolean indexed = indexedProp != null
              && (indexedProp.equalsIgnoreCase("yes")
                || indexedProp.equalsIgnoreCase("true") || indexedProp.equalsIgnoreCase("on"));
            boolean stored = storedProp != null
              && (storedProp.equalsIgnoreCase("yes")
                || storedProp.equalsIgnoreCase("true") || storedProp.equalsIgnoreCase("on"));
            boolean tokenized = tokProp != null
              && (tokProp.equalsIgnoreCase("yes")
                || tokProp.equalsIgnoreCase("true") || tokProp.equalsIgnoreCase("on"));
            boolean[] flags = {indexed, stored, tokenized};
            flagMap.put(name, flags);

            String multiProp = customFieldProps.getProperty(propName + ".multi");
            boolean multi = multiProp != null
              && (multiProp.equalsIgnoreCase("yes")
                || multiProp.equalsIgnoreCase("true") || multiProp.equalsIgnoreCase("on"));
            if (multi) {
              multiFields.add(name);
            }
            if (multi) {
              multiFields.add(name);
            }
          }
        }
      }
      catch (Exception e) {
        LOG.error("Error loading custom field properties:\n"
          + StringUtils.stringifyException(e));
      }
    }

    public void map(LongWritable key, Text value,
      OutputCollector<Text, FieldWritable> output, Reporter reporter)
      throws IOException {

      // split the file on tabs
      String line = value.toString();
      String[] fields = line.split("\t");
      if (fields.length >= 3) {

        // fields must be in order: url, name, value, optional boost, and
        // optional type; defaults are used for missing optional fields
        String url = fields[0];
        String fieldName = fields[1];
        String fieldVal = fields[2];
        String fieldScore = (fields.length > 3 ? fields[3] : null);
        String fieldType = (fields.length > 4 ? fields[4] : "CONTENT").toUpperCase();

        // creates the FieldWritable objects and collects
        boolean[] flags = flagMap.get(fieldName);
        if (flags != null) {
          FieldWritable field = null;
          if (fieldScore != null) {
            field = new FieldWritable(fieldName, fieldVal,
              FieldType.valueOf(fieldType), Float.parseFloat(fieldScore),
              flags[0], flags[1], flags[2]);
          }
          else {
            field = new FieldWritable(fieldName, fieldVal,
              FieldType.valueOf(fieldType), flags[0], flags[1], flags[2]);
          }
          output.collect(new Text(url), field);
        }
      }
    }

    public void reduce(Text key, Iterator<FieldWritable> values,
      OutputCollector<Text, FieldWritable> output, Reporter reporter)
      throws IOException {

      // if multiple values are allowed for a field, collect all of them; if
      // not, only the first value is collected and later values are ignored
      Set<String> multiSet = new HashSet<String>();
      while (values.hasNext()) {
        FieldWritable field = values.next();
        String name = field.getName();
        boolean isMulti = multiFields.contains(name);
        if (isMulti || !multiSet.contains(name)) {
          output.collect(key, field);
          multiSet.add(name);
        }
        else {
          LOG.info("Ignoring multiple " + name + " fields for "
            + key.toString());
        }
      }
    }

    public void close() {
    }
  }

  /**
   * Collects custom FieldWritable objects alongside the basic fields output
   * for the same URL.  Custom fields are only output for URLs that also have
   * basic fields.
   */
  public static class Collector
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldWritable> {

    private JobConf conf;

    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      // wrap each value in an ObjectWritable so FieldWritable objects from the
      // converter output and FieldsWritable objects from the basic fields
      // output can share a single map output value class
      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldWritable> output, Reporter reporter)
      throws IOException {

      FieldsWritable basicFields = null;
      List<FieldWritable> customFields = new ArrayList<FieldWritable>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof FieldWritable) {
          customFields.add((FieldWritable)obj);
        }
        else if (obj instanceof FieldsWritable) {
          basicFields = (FieldsWritable)obj;
        }
      }

      // only output custom fields for urls that also have basic fields;
      // custom fields without matching basic fields are dropped
      if (basicFields != null && customFields.size() > 0) {
        for (int i = 0; i < customFields.size(); i++) {
          output.collect(key, customFields.get(i));
        }
      }
    }
  }

  /**
   * Runs the converter job followed by the collector job to create the final
   * custom field output.
   *
   * @param basicFields The path to the basic fields output.
   * @param inputs The directories with text field files to convert.
   * @param output The final output directory for custom field objects.
   *
   * @throws IOException If an error occurs while running either job.
   */
  void createFields(Path basicFields, Path[] inputs, Path output)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
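    // write converter output to a randomized temporary path so concurrent
    // runs do not collide; it is deleted once the collector job completes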
    Path tempFields = new Path(output + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    runConverter(inputs, tempFields);
    runCollector(basicFields, tempFields, output);
    fs.delete(tempFields, true);
  }

  public static void main(String[] args)
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new CustomFields(),
      args);
    System.exit(res);
  }

  /**
   * Runs the CustomFields job.
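   *
   * A hypothetical invocation (paths are illustrative only), using the
   * options parsed below:
   *
   * <pre>
   * CustomFields -basicfields crawl/basicfields -input crawl/fieldtext -output crawl/customfields
   * </pre>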
   */
  public int run(String[] args)
    throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
      "the output index directory").create("output");
    Option inputOpts = OptionBuilder.withArgName("input").hasArgs().withDescription(
      "the input directories with text field files").create("input");
    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArg().withDescription(
      "the basicfields to use").create("basicfields");
    options.addOption(helpOpts);
    options.addOption(inputOpts);
    options.addOption(basicFieldOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("input")
        || !line.hasOption("output") || !line.hasOption("basicfields")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("CustomFields", options);
        return -1;
      }

      String[] inputs = line.getOptionValues("input");
      Path[] inputPaths = new Path[inputs.length];
      for (int i = 0; i < inputs.length; i++) {
        inputPaths[i] = new Path(inputs[i]);
      }
      String output = line.getOptionValue("output");
      String basicFields = line.getOptionValue("basicfields");

      createFields(new Path(basicFields), inputPaths, new Path(output));
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("CustomFields: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}