Package org.apache.nutch.indexer

Source Code of org.apache.nutch.indexer.IndexMerger

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.indexer;

import java.io.*;
import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;

import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;

/*************************************************************************
* IndexMerger creates an index for the output corresponding to a
* single fetcher run.
*
* @author Doug Cutting
* @author Mike Cafarella
*************************************************************************/
public class IndexMerger extends Configured implements Tool {
  public static final Log LOG = LogFactory.getLog(IndexMerger.class);

  public static final String DONE_NAME = "merge.done";

  public IndexMerger() {
   
  }
 
  public IndexMerger(Configuration conf) {
    setConf(conf);
  }
 
  /**
   * Merge all input indexes to the single output index
   */
  public void merge(Path[] indexes, Path outputIndex, Path localWorkingDir) throws IOException {
    LOG.info("merging indexes to: " + outputIndex);

    FileSystem localFs = FileSystem.getLocal(getConf())
    if (localFs.exists(localWorkingDir)) {
      localFs.delete(localWorkingDir, true);
    }
    localFs.mkdirs(localWorkingDir);

    // Get local output target
    //
    FileSystem fs = FileSystem.get(getConf());
    if (fs.exists(outputIndex)) {
      throw new FileAlreadyExistsException("Output directory " + outputIndex + " already exists!");
    }

    Path tmpLocalOutput = new Path(localWorkingDir, "merge-output");
    Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput);

    Directory[] dirs = new Directory[indexes.length];
    for (int i = 0; i < indexes.length; i++) {
      if (LOG.isInfoEnabled()) { LOG.info("Adding " + indexes[i]); }
      dirs[i] = new FsDirectory(fs, indexes[i], false, getConf());
    }

    //
    // Merge indices
    //
    IndexWriter writer = new IndexWriter(
        FSDirectory.open(new File(localOutput.toString())), null, true,
            MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
    writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
    writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
    writer.setInfoStream(LogUtil.getDebugStream(LOG));
    writer.setUseCompoundFile(false);
    writer.setSimilarity(new NutchSimilarity());
    writer.addIndexesNoOptimize(dirs);
    writer.optimize();
    writer.close();

    //
    // Put target back
    //
    fs.completeLocalOutput(outputIndex, tmpLocalOutput);
    LOG.info("done merging");
  }

  /**
   * Create an index for the input files in the named directory.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args);
    System.exit(res);
  }
 
  public int run(String[] args) throws Exception {
    String usage = "IndexMerger [-workingdir <workingdir>] outputIndex indexesDir...";
    if (args.length < 2) {
      System.err.println("Usage: " + usage);
      return -1;
    }

    //
    // Parse args, read all index directories to be processed
    //
    FileSystem fs = FileSystem.get(getConf());
    List<Path> indexDirs = new ArrayList<Path>();

    Path workDir = new Path("indexmerger-" + System.currentTimeMillis())
    int i = 0;
    if ("-workingdir".equals(args[i])) {
      i++;
      workDir = new Path(args[i++], "indexmerger-" + System.currentTimeMillis());
    }

    Path outputIndex = new Path(args[i++]);

    for (; i < args.length; i++) {
      FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
      indexDirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(fstats)));
    }

    //
    // Merge the indices
    //

    Path[] indexFiles = (Path[])indexDirs.toArray(new Path[indexDirs.size()]);

    try {
      merge(indexFiles, outputIndex, workDir);
      return 0;
    } catch (Exception e) {
      LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e));
      return -1;
    } finally {
      FileSystem.getLocal(getConf()).delete(workDir, true);
    }
  }
}
TOP

Related Classes of org.apache.nutch.indexer.IndexMerger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.