Source Code of com.bericotech.clavin.index.IndexDirectoryBuilder

package com.bericotech.clavin.index;


import static java.util.concurrent.TimeUnit.MILLISECONDS;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.bericotech.clavin.gazetteer.GeoName;


/*#####################################################################
 * 
 * CLAVIN (Cartographic Location And Vicinity INdexer)
 * ---------------------------------------------------
 * 
 * Copyright (C) 2012-2013 Berico Technologies
 * http://clavin.bericotechnologies.com
 * 
 * ====================================================================
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 * 
 * ====================================================================
 * 
 * IndexDirectoryBuilder.java
 * 
 *###################################################################*/


/**
 * Builds a Lucene index of geographic entries based on
 * the GeoNames gazetteer.
 * 
 * This program is run one-time before CLAVIN can be used.
 * 
 */
public class IndexDirectoryBuilder {
  
  public final static Logger logger = LoggerFactory.getLogger(IndexDirectoryBuilder.class);
  
  // the GeoNames gazetteer file to be loaded
  static String pathToGazetteer = "./allCountries.txt";


  /**
   * Turns a GeoNames gazetteer file into a Lucene index, and adds
   * some supplementary gazetteer records at the end.
   * 
   * @param args        not used
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    
    logger.info("Indexing... please wait.");
    
    // Create a new index file on disk, allowing Lucene to choose
    // the best FSDirectory implementation given the environment.
    // TODO: delete this directory first, if it exists
    FSDirectory index = FSDirectory.open(new File("./IndexDirectory"));
    
    // indexing by lower-casing & tokenizing on whitespace
    Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer();
    
    // create the object that will actually build the Lucene index
    IndexWriter indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_40, indexAnalyzer));
    
    // open the gazetteer files to be loaded
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(new File(pathToGazetteer)), "UTF-8"));
    BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File("./src/main/resources/SupplementaryGazetteer.txt")), "UTF-8"));
    
    String line;
    
    // let's see how long this takes...
    Date start = new Date();
    
    // load GeoNames gazetteer into Lucene index
    
    int count = 0;
    
    while ((line = r.readLine()) != null)
      try {
        count += 1;
        if (count % 100000 == 0 ) logger.info("rowcount: " + count);
        addToIndex(indexWriter, line);
      
      } catch (Exception e) {
         logger.info("Skipping... Error on line:" + line);
           
      }


    // add supplementary gazetteer records to index
       
    while ((line = r2.readLine()) != null)
      addToIndex(indexWriter, line);
    
    // that wasn't so long, was it?
    Date stop = new Date();
    
    logger.info("[DONE]");
    logger.info(indexWriter.maxDoc() + " geonames added to index.");
    logger.info("Merging indices... please wait.");
    
    indexWriter.close();
    index.close();
    r.close();
    r2.close();
    
    logger.info("[DONE]");
    
    DateFormat df = new SimpleDateFormat("HH:mm:ss");
    long elapsed_MILLIS = stop.getTime() - start.getTime();
    logger.info("Process started: " + df.format(start) + ", ended: " + df.format(stop)
        + "; elapsed time: " + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds.");
  }
  
  /**
   * Adds entries to the Lucene index for each unique name associated
   * with a {@link GeoName} object.
   * 
   * @param indexWriter  the object that actually builds the Lucene index
   * @param geonameEntry  single record from GeoNames gazetteer
   * @throws IOException
   */
    private static void addToIndex(IndexWriter indexWriter, String geonameEntry) throws IOException {
      
      // create a GeoName object from a single gazetteer record
      GeoName geoname = GeoName.parseFromGeoNamesRecord(geonameEntry);
      
      // add the primary (UTF-8) name for this location
      if (geoname.name.length() > 0)
        indexWriter.addDocument(buildDoc(geoname.name, geonameEntry, geoname.geonameID, geoname.population));
      
      // add the ASCII name if it's different from the primary name
      if (geoname.asciiName.length() > 0 && !geoname.asciiName.equals(geoname.name))
        indexWriter.addDocument(buildDoc(geoname.asciiName, geonameEntry, geoname.geonameID, geoname.population));
      
      // add alternate names (if any) if they differ from the primary
      // and alternate names
      for (String altName : geoname.alternateNames)
        if (altName.length() > 0 && !altName.equals(geoname.name) && !altName.equals(geoname.name))
          indexWriter.addDocument(buildDoc(altName, geonameEntry, geoname.geonameID, geoname.population));
    }
    
    /**
     * Builds a Lucene document to be added to the index based on a
     * specified name for the location and the corresponding
     * {@link GeoName} object.
     * 
     * @param name      name to serve as index key
     * @param geonameEntry  string from GeoNames gazetteer
     * @param geonameID    unique identifier (for quick look-up)
     * @param population  number of inhabitants (used for scoring)
     * @return         document to be added to the index
     */
    public static Document buildDoc(String name, String geonameEntry, int geonameID, Long population) {
      
      // in case you're wondering, yes, this is a non-standard use of
      // the Lucene Document construct
      Document doc = new Document();
      
      // this is essentially the key we'll try to match location
      // names against
      doc.add(new TextField("indexName", name, Field.Store.YES));
      
      // this is the payload we'll return when matching location
      // names to gazetteer records
      doc.add(new StoredField("geoname", geonameEntry));
      
      // TODO: use geonameID to link administrative subdivisions to
      //     each other
      doc.add(new IntField("geonameID", geonameID, Field.Store.YES));
      
      // we'll initially sort match results based on population
      doc.add(new LongField("population", population, Field.Store.YES));
      
      logger.debug("Adding to index: " + name);
      
      return doc;
    }


}
Source Code of com.bericotech.clavin.index.IndexDirectoryBuilder

Related Classes of com.bericotech.clavin.index.IndexDirectoryBuilder