Package uk.co.flax.lucene

Source Code of uk.co.flax.lucene.IndexRemoveField$FieldRemoveReader

/**
*   Copyright (c) 2011 Lemur Consulting Ltd.
*
*   Licensed under the Apache License, Version 2.0 (the "License");
*   you may not use this file except in compliance with the License.
*   You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*   limitations under the License.
*/

package uk.co.flax.lucene;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;

/**
* Class for removing fields from documents in an index. When executed from
* the command line, the arguments are:
*
* <pre>
* &lt;source index directory&gt;  Directory containing source index
* &lt;target index directory&gt;  Directory to contain target index
* &lt;field names&gt;             Comma separated list of fields to remove
* </pre>
*
* Note that terms are not changed: only the stored values of the named fields
* are dropped, which is equivalent to changing the 'stored' property of a
* field from true to false.
*
* @author tomw@flax.co.uk
*/
public class IndexRemoveField {

  // sub-class of FilterIndexReader for removing document fields
  private static class FieldRemoveReader extends FilterIndexReader {

    // collection of fields to remove
    private String[] fields;
   
    // how many documents processed so far
    private int count;
   
    /**
     * Create an FieldRemoveReader, wrapping the given IndexReader to remove
     * the named fields.
     *
     * @param reader The IndexReader to wrap
     * @param fields Fields to remove
     */
    public FieldRemoveReader(IndexReader reader, String... fields) {
      super(reader);
      this.fields = fields;
      count = 0;
    }
   
    /**
     * @return how many documents have been processed so far
     */
    public int getCount() {
      return count;
    }

      @Override
      public IndexReader[] getSequentialSubReaders() {
        return null;
      }
   
    @Override
    public Document document(int n) throws CorruptIndexException, IOException {
      if (count % 10000 == 0) {
        System.out.println();
        System.out.print("Progress: " + count + " ");
      }
      if (count % 500 == 0) {
        System.out.print(".");
      }
      count++;
      return this.document(n, null);
    }
   
    @Override
    public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
      Document doc = super.document(n, fieldSelector);
      for (String field : fields) {
        doc.removeFields(field);
      }
      return doc;
    }
   
  }
 
  // report on whether fields exist in the given reader, and return result
  @SuppressWarnings("unchecked")
  private static boolean hasFields(IndexReader reader, String... fields) {
    boolean value = true;
    Collection<String> fieldNames = reader.getFieldNames(FieldOption.ALL);
    for (String field : fields) {
      if (! fieldNames.contains(field)) {
        System.out.println("Field " + field + " not recognized");
        value = false;
      }
    }
    return value;
  }
 
  /**
   * Remove fields from documents in an index.
   *
   * @param args Command line arguments
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    // parse command line arguments
    if (args.length < 3) {
      System.out.println("Usage: java uk.co.flax.IndexRemoveField <source index dir> <target index dir> <field1,field2,...>");
      return;
    }
    String sourceIndex = args[0];
    String targetIndex = args[1];
    String fields = args[2];
   
    // open an IndexReader for the source index (read-only)
    Directory sourceDir = new NIOFSDirectory(new File(sourceIndex));
    IndexReader reader = IndexReader.open(sourceDir, true);
   
    // open an IndexWriter for the target index (with a null Analyzer)
    Directory targetDir = new NIOFSDirectory(new File(targetIndex));
    //IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, null);
    IndexWriter writer = new IndexWriter(targetDir, null, MaxFieldLength.UNLIMITED);
   
    // add indexes from a wrapped IndexReader to the IndexWriter
    try {
      String[] fieldNames = fields.split(",");
      if (hasFields(reader, fieldNames)) {
        System.out.print("Removing fields from " + reader.numDocs() + " documents");
        FieldRemoveReader removeReader = new FieldRemoveReader(reader, fieldNames);
        writer.addIndexes(new IndexReader[] { removeReader });
        System.out.println();
        System.out.println("Complete: " + removeReader.getCount());
      }
    } finally {
      reader.close();
      writer.close();
    }
  }
}
TOP

Related Classes of uk.co.flax.lucene.IndexRemoveField$FieldRemoveReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.