Package org.archive.access.nutch.searcher

Source Code of org.archive.access.nutch.searcher.WaxDateClosestQueryFilter

/* WaxDateQueryFilter
*
* $Id: WaxDateQueryFilter.java 1896 2007-08-01 21:44:31Z jlee-archive $
*
* Created 06/02/2005
*
* Copyright (C) 2005 Internet Archive.
*
* This file is part of the archive-access tools project
* (http://sourceforge.net/projects/archive-access).
*
* The archive-access tools are free software; you can redistribute them and/or
* modify them under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or any
* later version.
*
* The archive-access tools are distributed in the hope that they will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
* Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License along with
* the archive-access tools; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.access.nutch.searcher;

import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.TermQuery;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryException;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query.Clause;
import org.archive.util.ArchiveUtils;

import org.apache.lucene.search.queries.PwaClosestQuery;


/**
* Based on query-more DateQueryFilter from nutch.
* Query syntax is defined as date:YYYYMMDDmmssSS, i.e. the IA 14 digit date
* format or for ranges, date:YYYY...-YYYY... where YYYY... is at least a year
* optionally followed by month, day, etc. up to the granularity of the 14
* digit IA date format. Date as date:-YYYYMMDD for a range that is from the
* start of the epoch up to and inclusive of YYYYMMDD and its inverse doesn't
* work because the nutch query parser is stripping the '-' (Its probably
* interpreting it as NOT).
*
* @author Miguel Costa
* @note handle BUG wayback 0000153
*/
public class WaxDateClosestQueryFilter implements QueryFilter
{
  public static final Log LOGGER =
    LogFactory.getLog(WaxDateQueryFilter.class.getName());

  private Configuration conf;

  private static final String FIELD_NAME = "closestdate";

  /**
   * Query syntax is defined as date:YYYYMMddmmssSS
   * (i.e. the data is in the IA 14 digit format).
   */
   private static final Pattern pattern =
    Pattern.compile("^(\\d{14}+)$");


  public BooleanQuery filter(Query input, BooleanQuery output)
    throws QueryException
  {
    // Examine each clause in the Nutch query
    Clause [] clauses = input.getClauses();
   
    for (int i = 0; i < clauses.length; i++)
    {
      Clause c = clauses[i];

      // Skip if not date clauses
      if (!c.getField().equals(FIELD_NAME))
      {
        continue;
      }

      String dateTerm = c.getTerm().toString();
      Matcher matcher = pattern.matcher(dateTerm);
     
      if (matcher == null || !matcher.matches())
      {
        String message = "Wrong query syntax " + FIELD_NAME
          + ":" + dateTerm + ". Must be standalone 14 digit " +
          " IA format date.";
        LOGGER.error(message);
       
        throw new QueryException(message);
      }

      // So, date is in one format.
      // 14 character IA date.
      String d = matcher.group(1);
     
      if (d != null)
      {
        LOGGER.debug("Found single date: " + d);

        // This is not a range query. Take the passed date and convert
        // it to seconds-since-epoch.     
        output.add(new PwaClosestQuery(getTerm(getSeconds(pad(d)))),
                (c.isProhibited()
                    ? BooleanClause.Occur.MUST_NOT
                    : (c.isRequired()
                        ? BooleanClause.Occur.MUST
                        : BooleanClause.Occur.SHOULD
                       )
                 ));
       
        continue;
      }  

      String message = "Unparseable query " + dateTerm + " (Is " +
        "it in 14 digit IA date format?)";
     
      LOGGER.error(message);
     
      throw new QueryException(message);
    }

    return output;
  }
 
  protected int getSeconds(String s) throws QueryException
  {
    Date d = null;
   
    try
    {
      d = ArchiveUtils.parse14DigitDate(s);
    }
    catch (Exception e)
    {
      String message = "Failed parse of " + s + e.getMessage();
     
      throw new QueryException(message);
    }
   
    long seconds = d.getTime()/1000;
   
    if (seconds > Integer.MAX_VALUE)
    {
      throw new RuntimeException("Seconds is larger than " +
        " Integer.MAX_VALUE: " + seconds);
    }
   
    return (int)seconds;
  }

  private Term getTerm(int seconds)
  {
    return new Term(FIELD_NAME, ArchiveUtils.zeroPadInteger(seconds));
  }

  private String pad(String s)
  {
    return ArchiveUtils.padTo(s, 14, '0');
  }

  public Configuration getConf()
  {
    return this.conf;
  }

  public void setConf(Configuration conf)
  {
    this.conf = conf;
  }
}
TOP

Related Classes of org.archive.access.nutch.searcher.WaxDateClosestQueryFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.