Package org.apache.nutch.searcher.more

Source Code of org.apache.nutch.searcher.more.DateQueryFilter

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.searcher.more;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.QueryException;

import org.apache.hadoop.conf.Configuration;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.index.Term;

import java.util.regex.Pattern;
import java.util.regex.Matcher;


/**
* Handles "date:" query clauses, causing them to search the field "date"
* indexed by MoreIndexingFilter.java
*
* @author John Xing
*/
public class DateQueryFilter implements QueryFilter {

  public static final Log LOG = LogFactory.getLog(DateQueryFilter.class);

  private static final String FIELD_NAME = "date";

  // query syntax is defined as date:yyyymmdd-yyyymmdd
  private static final Pattern pattern = Pattern.compile("^(\\d{8})-(\\d{8})$");

  private Configuration conf;
   
  public BooleanQuery filter(Query input, BooleanQuery output)
    throws QueryException {

    // examine each clause in the Nutch query
    Clause[] clauses = input.getClauses();
   
    for (int i = 0; i <clauses.length; i++) {
      Clause c = clauses[i];
     
      //skip if not date clauses
      if (!c.getField().equals(FIELD_NAME))
        continue;
           
      String x = c.getTerm().toString();
      
      Matcher matcher = pattern.matcher(x);
      if (!matcher.matches()) {
        throw new QueryException("Wrong query syntax "+FIELD_NAME+":"+x);
      }

      // do it as lucene RangeQuery
      Term xLower = new Term(FIELD_NAME, matcher.group(1));
      Term xUpper = new Term(FIELD_NAME, matcher.group(2));

      // inclusive
      RangeQuery rangeQuery = new RangeQuery(xLower, xUpper, true);

      rangeQuery.setBoost(0.0f);                  // trigger filterization
         
      output.add(rangeQuery,
          (c.isProhibited()
              ? BooleanClause.Occur.MUST_NOT
              : (c.isRequired()
                  ? BooleanClause.Occur.MUST
                  : BooleanClause.Occur.SHOULD
                 )
           ));
            
    }

    return output;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}
TOP

Related Classes of org.apache.nutch.searcher.more.DateQueryFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.