Package org.wltea.analyzer.lucene

Source Code of org.wltea.analyzer.lucene.IKQueryParser$ExpressionParser

/**
*
*/
package org.wltea.analyzer.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
* IK查询分析器
* 实现了对分词歧义结果的非冲突排列组合
* 有效的优化对歧义关键词的搜索命中
* 针对IK Analyzer V3的优化实现
*
* @author 林良益
*
*/
public final class IKQueryParser {
 
 
  //查询关键字解析缓存线程本地变量
  private static ThreadLocal<Map<String , TokenBranch>> keywordCacheThreadLocal
      = new ThreadLocal<Map<String , TokenBranch>>();
 
 
  //是否采用最大词长分词
  private static boolean isMaxWordLength = false;

  /**
   * 设置分词策略
   * isMaxWordLength = true 采用最大词长分词
   * @param isMaxWordLength
   */
  public static void setMaxWordLength(boolean isMaxWordLength) {
    IKQueryParser.isMaxWordLength = isMaxWordLength ;
  }
 
  /**
   * 优化query队列
   * 减少Query表达式的嵌套
   * @param queries
   * @return
   */
  private static Query optimizeQueries(List<Query> queries){ 
    //生成当前branch 的完整query
    if(queries.size() == 0){
      return null;
    }else if(queries.size() == 1){
      return queries.get(0);
    }else{
      BooleanQuery mustQueries = new BooleanQuery();
      for(Query q : queries){
        mustQueries.add(q, Occur.MUST);
      }
      return mustQueries;
    }     
  }
 
  /**
   * 获取线程本地的解析缓存
   * @return
   */
  private static Map<String , TokenBranch> getTheadLocalCache(){
    Map<String , TokenBranch> keywordCache = keywordCacheThreadLocal.get();
    if(keywordCache == null){
       keywordCache = new HashMap<String , TokenBranch>(4);
       keywordCacheThreadLocal.set(keywordCache);
    }
    return keywordCache;
  }
 
  /**
   * 缓存解析结果的博弈树
   * @param query
   * @return
   */
  private static TokenBranch getCachedTokenBranch(String query){
    Map<String , TokenBranch> keywordCache = getTheadLocalCache();
    return keywordCache.get(query);
  }
 
  /**
   * 缓存解析结果的博弈树
   * @param query
   * @return
   */
  private static void cachedTokenBranch(String query , TokenBranch tb){
    Map<String , TokenBranch> keywordCache = getTheadLocalCache();
    keywordCache.put(query, tb);
  }
   
 
  /**
   * 单连续字窜(不带空格符)单Field查询分析
   * @param field
   * @param query
   * @return
   */
  public static Query parse(String field , String query){
    if(field == null){
      throw new IllegalArgumentException("parameter \"field\" is null");
    }

    if(query == null || "".equals(query.trim())){
      return new TermQuery(new Term(field));
    }
   
    //从缓存中取出已经解析的query生产的TokenBranch
    TokenBranch root = getCachedTokenBranch(query);
    if(root != null){
      return optimizeQueries(root.toQueries(field));
    }else{
      //System.out.println(System.currentTimeMillis());
      root = new TokenBranch(null);   
      //对查询条件q进行分词
      StringReader input = new StringReader(query.trim());
      IKSegmentation ikSeg = new IKSegmentation(input , isMaxWordLength);
      try {
        for(Lexeme lexeme = ikSeg.next() ; lexeme != null ; lexeme = ikSeg.next()){
          //处理词元分支
          root.accept(lexeme);
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
      //缓存解析结果的博弈树
      cachedTokenBranch(query , root);
      return optimizeQueries(root.toQueries(field));
    }
  }
 
  /**
   * 解析IK简易查询表达式
   * @param ikQueryExp
   * @return Query 查询逻辑对象
   */
  public static Query parse(String ikQueryExp){
    ExpressionParser ikExpParser = new ExpressionParser();
    return ikExpParser.parserExp(ikQueryExp);
  }
 
  /**
   * 单条件,单Field查询分析
   * @param field -- Document field name
   * @param query -- keyword
   * @return Query 查询逻辑对象
   * @throws IOException
   */
//  public static Query parse(String field , String query){
//    if(field == null){
//      throw new IllegalArgumentException("parameter \"field\" is null");
//    }
//    String[] qParts = query.split("\\s");
//    if(qParts.length > 1){     
//      BooleanQuery resultQuery = new BooleanQuery();
//      for(String q : qParts){
//        //过滤掉由于连续空格造成的空字串
//        if("".equals(q)){
//          continue;
//        }
//        Query partQuery;
//        try {
//          partQuery = _parse(field , q);
//          if(partQuery != null &&
//                    (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
//            resultQuery.add(partQuery, Occur.MUST);
//          }
//        } catch (IOException e) {
//          e.printStackTrace();
//        }
//      }
//      return resultQuery;
//    }else{
//      try {
//        return _parse(field , query);
//      } catch (IOException e) {
//        e.printStackTrace();
//      }
//    }
//    return null;
//  }
 
  /**
   * 多Field,单条件查询分析
   * @param fields -- Document fields name
   * @param query  -- keyword
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String query) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }   
    BooleanQuery resultQuery = new BooleanQuery();   
    for(String field : fields){
      if(field != null){
        Query partQuery = parse(field , query);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, Occur.SHOULD);
        }
      }     
    }   
    return resultQuery;
  }
 
  /**
   * 多Field,单条件,多Occur查询分析
   * @param fields -- Document fields name
   * @param query  -- keyword
   * @param flags -- BooleanClause
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String query ,  BooleanClause.Occur[] flags) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    if(flags == null){
      throw new IllegalArgumentException("parameter \"flags\" is null");
    }
   
    if (flags.length != fields.length){
          throw new IllegalArgumentException("flags.length != fields.length");
    }   
   
    BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , query);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, flags[i]);
        }
      }     
    }   
    return resultQuery;
  }
 
  /**
   * 多Field多条件查询分析
   * @param fields
   * @param queries
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String[] queries) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }       
    if(queries == null){
      throw new IllegalArgumentException("parameter \"queries\" is null");
    }       
    if (queries.length != fields.length){
          throw new IllegalArgumentException("queries.length != fields.length");
    }
    BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , queries[i]);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, Occur.SHOULD);
        }
      }     
    }   
    return resultQuery;
  }

  /**
   * 多Field,多条件,多Occur查询分析
   * @param fields
   * @param queries
   * @param flags
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String[] queries , BooleanClause.Occur[] flags) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }       
    if(queries == null){
      throw new IllegalArgumentException("parameter \"queries\" is null");
    }
    if(flags == null){
      throw new IllegalArgumentException("parameter \"flags\" is null");
    }
   
      if (!(queries.length == fields.length && queries.length == flags.length)){
          throw new IllegalArgumentException("queries, fields, and flags array have have different length");
      }

      BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , queries[i]);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, flags[i]);
        }
      }     
    }   
    return resultQuery;
 
  /**
   * 词元分支
   * 当分词出现歧义时,采用词元分支容纳不同的歧义组合
   * @author 林良益
   *
   */
  private static class TokenBranch{
   
    private static final int REFUSED = -1;
    private static final int ACCEPTED = 0;
    private static final int TONEXT = 1;
   
    //词元分支左边界
    private int leftBorder;
    //词元分支右边界
    private int rightBorder;
    //当前分支主词元
    private Lexeme lexeme;
    //当前分支可并入的词元分支
    private List<TokenBranch> acceptedBranchs;
    //当前分支的后一个相邻分支
    private TokenBranch nextBranch;
   
    TokenBranch(Lexeme lexeme){
      if(lexeme != null){
        this.lexeme = lexeme;
        //初始化branch的左右边界
        this.leftBorder = lexeme.getBeginPosition();
        this.rightBorder = lexeme.getEndPosition();
      }
    }
   
    public int getLeftBorder() {
      return leftBorder;
    }

    public int getRightBorder() {
      return rightBorder;
    }

    public Lexeme getLexeme() {
      return lexeme;
    }

    public List<TokenBranch> getAcceptedBranchs() {
      return acceptedBranchs;
    }

    public TokenBranch getNextBranch() {
      return nextBranch;
    }

    public int hashCode(){
      if(this.lexeme == null){
        return 0;
      }else{
        return this.lexeme.hashCode() * 37;
      }
    }
   
    public boolean equals(Object o){     
      if(o == null){
        return false;
      }   
      if(this == o){
        return true;
      }
      if(o instanceof TokenBranch){
        TokenBranch other = (TokenBranch)o;
        if(this.lexeme == null ||
            other.getLexeme() == null){
          return false;
        }else{
          return this.lexeme.equals(other.getLexeme());
        }
      }else{
        return false;
      }     
    } 
   
    /**
     * 组合词元分支
     * @param _lexeme
     * @return 返回当前branch能否接收词元对象
     */
    boolean accept(Lexeme _lexeme){
     
      /*
       * 检查新的lexeme 对当前的branch 的可接受类型
       * acceptType : REFUSED  不能接受
       * acceptType : ACCEPTED 接受
       * acceptType : TONEXT   由相邻分支接受
       */     
      int acceptType = checkAccept(_lexeme);     
      switch(acceptType){
      case REFUSED:
        // REFUSE 情况
        return false;
       
      case ACCEPTED :
        if(acceptedBranchs == null){
          //当前branch没有子branch,则添加到当前branch下
          acceptedBranchs = new ArrayList<TokenBranch>(2);
          acceptedBranchs.add(new TokenBranch(_lexeme));         
        }else{
          boolean acceptedByChild = false;
          //当前branch拥有子branch,则优先由子branch接纳
          for(TokenBranch childBranch : acceptedBranchs){
            acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild;
          }
          //如果所有的子branch不能接纳,则由当前branch接纳
          if(!acceptedByChild){
            acceptedBranchs.add(new TokenBranch(_lexeme));
          }         
        }
        //设置branch的最大右边界
        if(_lexeme.getEndPosition() > this.rightBorder){
          this.rightBorder = _lexeme.getEndPosition();
        }
        break;
       
      case TONEXT :
        //把lexeme放入当前branch的相邻分支
        if(this.nextBranch == null){
          //如果还没有相邻分支,则建立一个不交叠的分支
          this.nextBranch = new TokenBranch(null);
        }
        this.nextBranch.accept(_lexeme);
        break;
      }

      return true;
    }
   
    /**
     * 将分支数据转成Query逻辑
     * @return
     */
    List<Query> toQueries(String fieldName){     
      List<Query> queries = new ArrayList<Query>(1);     
       //生成当前branch 的query
      if(lexeme != null){
        queries.add(new TermQuery(new Term(fieldName , lexeme.getLexemeText())));
      }     
      //生成child branch 的query
      if(acceptedBranchs != null && acceptedBranchs.size() > 0){
        if(acceptedBranchs.size() == 1){
          Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName));
          if(onlyOneQuery != null){
            queries.add(onlyOneQuery);
          }         
        }else{
          BooleanQuery orQuery = new BooleanQuery();
          for(TokenBranch childBranch : acceptedBranchs){
            Query childQuery = optimizeQueries(childBranch.toQueries(fieldName));
            if(childQuery != null){
              orQuery.add(childQuery, Occur.SHOULD);
            }
          }
          if(orQuery.getClauses().length > 0){
            queries.add(orQuery);
          }
        }
      }     
      //生成nextBranch的query
      if(nextBranch != null){       
        queries.addAll(nextBranch.toQueries(fieldName));
      }
      return queries; 
    }
   
    /**
     * 判断指定的lexeme能否被当前的branch接受
     * @param lexeme
     * @return 返回接受的形式
     */
    private int checkAccept(Lexeme _lexeme){
      int acceptType = 0;
     
      if(_lexeme == null){
        throw new IllegalArgumentException("parameter:lexeme is null");
      }
     
      if(null == this.lexeme){//当前的branch是一个不交叠(ROOT)的分支
        if(this.rightBorder > //说明当前branch内至少有一个lexeme
            && _lexeme.getBeginPosition() >= this.rightBorder){
          //_lexeme 与 当前的branch不相交
          acceptType = TONEXT;
        }else{
          acceptType = ACCEPTED;
        }       
      }else{//当前的branch是一个有交叠的分支
       
        if(_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()){
          //_lexeme 的位置比 this.lexeme还靠前(这种情况不应该发生)
          acceptType = REFUSED;
        }else if(_lexeme.getBeginPosition() >= this.lexeme.getBeginPosition()
              && _lexeme.getBeginPosition() < this.lexeme.getEndPosition()){
          // _lexeme 与 this.lexeme相交
          acceptType = REFUSED;
        }else if(_lexeme.getBeginPosition() >= this.lexeme.getEndPosition()
              && _lexeme.getBeginPosition() < this.rightBorder){
          //_lexeme 与 this.lexeme 不相交, 但_lexeme 与 当前的branch相交
          acceptType = ACCEPTED;
        }else{//_lexeme.getBeginPosition() >= this.rightBorder
          //_lexeme 与 当前的branch不相交
          acceptType=  TONEXT;
        }
      }
      return acceptType;
    }
 
  }
 
  /**
   * 查询表达式解析
   * alpha版本
   * 自定义lucene查询表达式
   * 表达式例子 :
   * (id='1231231' && title:'monkey') || (content:'你好吗'  || ulr='www.ik.com') - name:'helloword'
   *
   * @author linliangyi
   * May 20, 2010
   */
  static class ExpressionParser {
   
    //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
   
    private List<Element> elements = new ArrayList<Element>();
   
    private Stack<Query> querys =  new Stack<Query>();
   
    private Stack<Element> operates = new Stack<Element>();
   
    public ExpressionParser(){
    }

    /**
     * 解析查询表达式,生成Lucene Query对象
     *
     * @param expression
     * @return
     */
    public Query parserExp(String expression){
      Query lucenceQuery = null;
      try{
        //文法解析
        this.splitElements(expression);
        //语法解析
        this.parseSyntax();
        if(this.querys.size() == 1){
          lucenceQuery = this.querys.pop();
        }else{
          throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失");
        }
      }finally{
        elements.clear();
        querys.clear();
        operates.clear();
      }   
      return lucenceQuery;
    }
   
    /**
     * 表达式文法解析
     * @param expression
     */
    private void splitElements(String expression){
      
      if(expression == null){
        return;
      }
      Element curretElement = null;
     
      char[] expChars = expression.toCharArray();
      for(int i = 0 ; i < expChars.length ; i++){
        switch(expChars[i]){
        case '&' :
          if(curretElement == null){
            curretElement = new Element();
            curretElement.type = '&';
            curretElement.append(expChars[i]);
          }else if(curretElement.type == '&'){
            curretElement.append(expChars[i]);
            this.elements.add(curretElement);
            curretElement = null;
          }else if(curretElement.type == '\''){
            curretElement.append(expChars[i]);
          }else {
            this.elements.add(curretElement);
            curretElement = new Element();
            curretElement.type = '&';
            curretElement.append(expChars[i]);
          }
          break;
         
        case '|' :
          if(curretElement == null){
            curretElement = new Element();
            curretElement.type = '|';
            curretElement.append(expChars[i]);
          }else if(curretElement.type == '|'){
            curretElement.append(expChars[i]);
            this.elements.add(curretElement);
            curretElement = null;
          }else if(curretElement.type == '\''){
            curretElement.append(expChars[i]);
          }else {
            this.elements.add(curretElement);
            curretElement = new Element();
            curretElement.type = '|';
            curretElement.append(expChars[i]);
          }       
          break;
         
        case '-' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '-';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;     
          break;

        case '(' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '(';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;     
          break;       

        case ')' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = ')';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;     
          break;         

        case ':' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = ':';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;     
          break
       
        case '=' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '=';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;     
          break;         

        case ' ' :
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
            }else{
              this.elements.add(curretElement);
              curretElement = null;
            }
          }
         
          break;
       
        case '\'' :
          if(curretElement == null){
            curretElement = new Element();
            curretElement.type = '\'';
           
          }else if(curretElement.type == '\''){
            this.elements.add(curretElement);
            curretElement = null;
           
          }else{
            this.elements.add(curretElement);
            curretElement = new Element();
            curretElement.type = '\'';
           
          }
          break;
         
        case '[':
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '[';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;         
          break;
         
        case ']':
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = ']';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;
         
          break;
         
        case '{':
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '{';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;         
          break;
         
        case '}':
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = '}';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;
         
          break;
        case ',':
          if(curretElement != null){
            if(curretElement.type == '\''){
              curretElement.append(expChars[i]);
              continue;
            }else{
              this.elements.add(curretElement);
            }
          }
          curretElement = new Element();
          curretElement.type = ',';
          curretElement.append(expChars[i]);
          this.elements.add(curretElement);
          curretElement = null;
         
          break;
         
        default :
          if(curretElement == null){
            curretElement = new Element();
            curretElement.type = 'F';
            curretElement.append(expChars[i]);
           
          }else if(curretElement.type == 'F'){
            curretElement.append(expChars[i]);
           
          }else if(curretElement.type == '\''){
            curretElement.append(expChars[i]);

          }else{
            this.elements.add(curretElement);
            curretElement = new Element();
            curretElement.type = 'F';
            curretElement.append(expChars[i]);
          }     
        }
      }
     
      if(curretElement != null){
        this.elements.add(curretElement);
        curretElement = null;
      }
    }
   
    /**
     * 语法解析
     *
     */
    private void parseSyntax(){
      for(int i = 0 ; i < this.elements.size() ; i++){
        Element e = this.elements.get(i);
        if('F' == e.type){
          Element e2 = this.elements.get(i + 1);
          if('=' != e2.type && ':' != e2.type){
            throw new IllegalStateException("表达式异常: = 或 : 号丢失");
          }
          Element e3 = this.elements.get(i + 2);
          //处理 = 和 : 运算
          if('\'' == e3.type){
            i+=2;
            if('=' == e2.type){
              TermQuery tQuery = new TermQuery(new Term(e.toString() , e3.toString()));
              this.querys.push(tQuery);
            }else if(':' == e2.type){
              String keyword = e3.toString();
              if(keyword.startsWith("^") && keyword.endsWith("$")){
                Query pQuery = this.luceneQueryParse(e.toString(), keyword);
                this.querys.push(pQuery);
              }else{
                Query tQuery = IKQueryParser.parse(e.toString(), e3.toString());
                this.querys.push(tQuery);
              }
            }
           
          }else if('[' == e3.type || '{' == e3.type){
            i+=2;
            //处理 [] 和 {}
            LinkedList<Element> eQueue = new LinkedList<Element>();
            eQueue.add(e3);
            for( i++ ; i < this.elements.size() ; i++){             
              Element eN = this.elements.get(i);
              eQueue.add(eN);
              if(']' == eN.type || '}' == eN.type){
                break;
              }
            }
            //翻译RangeQuery
            Query rangeQuery = this.toTermRangeQuery(e , eQueue);
            this.querys.push(rangeQuery);
          }else{
            throw new IllegalStateException("表达式异常:匹配值丢失");
          }
         
        }else if('(' == e.type){
          this.operates.push(e);
         
        }else if(')' == e.type){
          boolean doPop = true;
          while(doPop && !this.operates.empty()){
            Element op = this.operates.pop();
            if('(' == op.type){
              doPop = false;
            }else {
              Query q = toQuery(op);
              this.querys.push(q);
            }
           
          }
        }else{
         
          if(this.operates.isEmpty()){
            this.operates.push(e);
          }else{
            boolean doPeek = true;
            while(doPeek && !this.operates.isEmpty()){
              Element eleOnTop = this.operates.peek();
              if('(' == eleOnTop.type){
                doPeek = false;
                this.operates.push(e);
              }else if(compare(e , eleOnTop) == 1){
                this.operates.push(e);
                doPeek = false;
              }else if(compare(e , eleOnTop) == 0){
                Query q = toQuery(eleOnTop);
                this.operates.pop();
                this.querys.push(q);
              }else{
                Query q = toQuery(eleOnTop);
                this.operates.pop();
                this.querys.push(q);
              }
            }
           
            if(doPeek && this.operates.empty()){
              this.operates.push(e);
            }
          }
        }     
      }
     
      while(!this.operates.isEmpty()){
        Element eleOnTop = this.operates.pop();
        Query q = toQuery(eleOnTop);
        this.querys.push(q);     
      }   
    }
   
    /**
     * 根据逻辑操作符,生成BooleanQuery
     * @param op
     * @return
     */
    private Query toQuery(Element op){
      if(this.querys.size() == 0){
        return null;
      }
     
      BooleanQuery resultQuery = new BooleanQuery();

      if(this.querys.size() == 1){
        return this.querys.get(0);
      }
     
      Query q2 = this.querys.pop();
      Query q1 = this.querys.pop();
      if('&' == op.type){
        if(q1 != null){
          if(q1 instanceof BooleanQuery){
            BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
            if(clauses.length > 0
                && clauses[0].getOccur() == Occur.MUST){
              for(BooleanClause c : clauses){
                resultQuery.add(c);
              }         
            }else{
              resultQuery.add(q1,Occur.MUST);
            }

          }else{
            //q1 instanceof TermQuery
            //q1 instanceof TermRangeQuery
            //q1 instanceof PhraseQuery
            //others
            resultQuery.add(q1,Occur.MUST);
          }
        }
       
        if(q2 != null){
          if(q2 instanceof BooleanQuery){
            BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
            if(clauses.length > 0
                && clauses[0].getOccur() == Occur.MUST){
              for(BooleanClause c : clauses){
                resultQuery.add(c);
              }         
            }else{
              resultQuery.add(q2,Occur.MUST);
            }
           
          }else{
            //q1 instanceof TermQuery
            //q1 instanceof TermRangeQuery
            //q1 instanceof PhraseQuery
            //others
            resultQuery.add(q2,Occur.MUST);
          }
        }
       
      }else if('|' == op.type){
        if(q1 != null){
          if(q1 instanceof BooleanQuery){
            BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
            if(clauses.length > 0
                && clauses[0].getOccur() == Occur.SHOULD){
              for(BooleanClause c : clauses){
                resultQuery.add(c);
              }         
            }else{
              resultQuery.add(q1,Occur.SHOULD);
            }
           
          }else{
            //q1 instanceof TermQuery
            //q1 instanceof TermRangeQuery
            //q1 instanceof PhraseQuery
            //others
            resultQuery.add(q1,Occur.SHOULD);
          }
        }
       
        if(q2 != null){
          if(q2 instanceof BooleanQuery){
            BooleanClause[] clauses = ((BooleanQuery)q2).getClauses();
            if(clauses.length > 0
                && clauses[0].getOccur() == Occur.SHOULD){
              for(BooleanClause c : clauses){
                resultQuery.add(c);
              }         
            }else{
              resultQuery.add(q2,Occur.SHOULD);
            }
          }else{
            //q2 instanceof TermQuery
            //q2 instanceof TermRangeQuery
            //q2 instanceof PhraseQuery
            //others
            resultQuery.add(q2,Occur.SHOULD);
           
          }
        }
       
      }else if('-' == op.type){
        if(q1 == null || q2 == null){
          throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
        }
       
        if(q1 instanceof BooleanQuery){
          BooleanClause[] clauses = ((BooleanQuery)q1).getClauses();
          if(clauses.length > 0){
            for(BooleanClause c : clauses){
              resultQuery.add(c);
            }         
          }else{
            resultQuery.add(q1,Occur.MUST);
          }

        }else{
          //q1 instanceof TermQuery
          //q1 instanceof TermRangeQuery
          //q1 instanceof PhraseQuery
          //others
          resultQuery.add(q1,Occur.MUST);
        }       
       
        resultQuery.add(q2,Occur.MUST_NOT);
      }
      return resultQuery;
    }
   
    /**
     * 组装TermRangeQuery
     * @param elements
     * @return
     */
    private TermRangeQuery toTermRangeQuery(Element fieldNameEle , LinkedList<Element> elements){

      boolean includeFirst = false;
      boolean includeLast = false;
      String firstValue = null;
      String lastValue = null;
      //检查第一个元素是否是[或者{
      Element first = elements.getFirst();
      if('[' == first.type){
        includeFirst = true;
      }else if('{' == first.type){
        includeFirst = false;
      }else {
        throw new IllegalStateException("表达式异常");
      }
      //检查最后一个元素是否是]或者}
      Element last = elements.getLast();
      if(']' == last.type){
        includeLast = true;
      }else if('}' == last.type){
        includeLast = false;
      }else {
        throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号");
      }
      if(elements.size() < 4 || elements.size() > 5){
        throw new IllegalStateException("表达式异常, RangeQuery 错误");
      }     
      //读出中间部分
      Element e2 = elements.get(1);
      if('\'' == e2.type){
        firstValue = e2.toString();
        //
        Element e3 = elements.get(2);
        if(',' != e3.type){
          throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔");
        }
        //
        Element e4 = elements.get(3);
        if('\'' == e4.type){
          lastValue = e4.toString();
        }else if(e4 != last){
          throw new IllegalStateException("表达式异常,RangeQuery格式错误");
        }       
      }else if(',' == e2.type){
        firstValue = null;
        //
        Element e3 = elements.get(2);
        if('\'' == e3.type){
          lastValue = e3.toString();
        }else{
          throw new IllegalStateException("表达式异常,RangeQuery格式错误");
        }
       
      }else {
        throw new IllegalStateException("表达式异常, RangeQuery格式错误");
      }
     
      return new TermRangeQuery(fieldNameEle.toString() , firstValue , lastValue , includeFirst , includeLast);
    }
   
    /**
     * 组装Lucene Query
     * 处理关键字紧凑搜索
     * @param elements
     * @return
     */
    private Query luceneQueryParse(String fieldName , String keyword){
      //截取头部^尾部$
      keyword = keyword.substring(1 , keyword.length() - 1);
      String luceneExp = fieldName + ":\"" + keyword + "\"";
      QueryParser luceneQueryParser = new QueryParser(Version.LUCENE_30 , "" ,new IKAnalyzer());
      try {
        Query lucenceQuery = luceneQueryParser.parse(luceneExp);
        return lucenceQuery;
      } catch (ParseException e) {
        e.printStackTrace();
      }               
      return null;     
    }
   
   
    /**
     * 比较操作符优先级
     * @param e1
     * @param e2
     * @return
     */
    private int compare(Element e1 , Element e2){
      if('&' == e1.type){
        if('&' == e2.type){
          return 0;
        }else {
          return 1;
        }
      }else if('|' == e1.type){
        if('&' == e2.type){
          return -1;
        }else if('|' == e2.type){
          return 0;
        }else{
          return 1;
        }
      }else{
        if('-' == e2.type){
          return 0;
        }else{
          return -1;
        }
      }
    }
   
    /**
     * 表达式元素
     *
     * @author linliangyi
     * May 20, 2010
     */
    private class Element{
      char type = 0;
      StringBuffer eleTextBuff;

      public Element(){
        eleTextBuff = new StringBuffer();
      }
     
      public void append(char c){
        this.eleTextBuff.append(c);
      }
   
      public String toString(){
        return this.eleTextBuff.toString();
      }
     
    }
  }
  public static void main(String[] args){
    String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'^魔兽中国$') || (content:'魔兽 中国'  || ulr='www.ik.com') - name:'林良益'";
//    String ikQueryExp = "content:'----'  || title:'----' - name:'林良益'";
    Query result = IKQueryParser.parse(ikQueryExp);
//    Query result = IKQueryParser.parse("(newsKeyword='---' || newsTitle:'---' || newsContent:'---') && newsClass='1'");
    System.out.println(result);

 
}
TOP

Related Classes of org.wltea.analyzer.lucene.IKQueryParser$ExpressionParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.