Package com.atlantbh.nutch.index.alternativedataflow

Source Code of com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilter

package com.atlantbh.nutch.index.alternativedataflow;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;

import com.atlantbh.nutch.index.alternativedataflow.conf.AlternativeDataFlowIndexingFilterConfiguration;
import com.atlantbh.nutch.index.alternativedataflow.conf.Entry;
import com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow;
import com.atlantbh.nutch.index.alternativedataflow.flow.DataFlow;

public class AlternativeDataFlowIndexingFilter implements IndexingFilter {

  // Constants
  private static Logger log = Logger.getLogger(AlternativeDataFlowIndexingFilter.class);

  // Configuration
  private Configuration configuration;
  private AlternativeDataFlowIndexingFilterConfiguration alternativeDataFlowConfiguration;

  // Internal data
  private boolean initialized = false;

  // **********************************
  // TEMPORARY IDEA OF CONFIGURATION
  // **********************************
  private static final Map<String, DataFlow> dataFlowMap = new HashMap<String, DataFlow>();
  static {
    dataFlowMap.put("CSV", new CsvDataFlow());

    // Call the destroy method on JVM shutdown
    Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {

      @Override
      public void run() {

        // Iterate trough the dataFlowMap and call destroy method
        for (String dataFlowId : dataFlowMap.keySet()) {

          DataFlow dataFlow = dataFlowMap.get(dataFlowId);
          dataFlow.destroy();
        }
      }
    }));
  }

  public AlternativeDataFlowIndexingFilter() {
  }

  private void initConfig() {

    // Initialize configuration
    alternativeDataFlowConfiguration = AlternativeDataFlowIndexingFilterConfiguration.getInstance(configuration);
  }

  private void initDataFlows() {

    // Initialize only once
    if (!initialized) {

      // Maps the data flow identifers with it's configuration entries
      Map<String, List<Entry>> dataFlowEntryListMap = new HashMap<String, List<Entry>>();
      for (Entry entry : alternativeDataFlowConfiguration.getEntryList()) {

        // Get or create an entry list
        List<Entry> entryList;
        if (dataFlowEntryListMap.containsKey(entry.getDataFlow())) {
          entryList = dataFlowEntryListMap.get(entry.getDataFlow());
        } else {
          entryList = new LinkedList<Entry>();
          dataFlowEntryListMap.put(entry.getDataFlow(), entryList);
        }

        entryList.add(entry);
      }

      // Iterate trough the dataFlowMap and initialize it
      for (String dataFlowId : dataFlowMap.keySet()) {

        DataFlow dataFlow = dataFlowMap.get(dataFlowId);
        List<Entry> entryList = dataFlowEntryListMap.get(dataFlowId);
        dataFlow.init(configuration, entryList);
      }

      initialized = true;
    }
  }

  @Override
  public Configuration getConf() {
    return configuration;
  }

  @Override
  public void setConf(Configuration configuration) {
    this.configuration = configuration;
    initConfig();
  }

  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    // Initialize the data flows
    // Executed only the first time
    initDataFlows();

    // Iterate trough the dataFlowMap and redirect the data flow to them
    for (String dataFlowId : dataFlowMap.keySet()) {

      DataFlow dataFlow = dataFlowMap.get(dataFlowId);
      dataFlow.processData(doc, parse, url, datum, inlinks);
    }

    return doc;
  }

}
TOP

Related Classes of com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.