Package cc.twittertools.search.api

Source Code of cc.twittertools.search.api.TrecSearchHandler

/**
* Twitter Tools
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cc.twittertools.search.api;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

import javax.annotation.Nullable;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import cc.twittertools.index.IndexStatuses;
import cc.twittertools.index.IndexStatuses.StatusField;
import cc.twittertools.thrift.gen.TQuery;
import cc.twittertools.thrift.gen.TResult;
import cc.twittertools.thrift.gen.TrecSearch;
import cc.twittertools.thrift.gen.TrecSearchException;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

public class TrecSearchHandler implements TrecSearch.Iface {
  private static final Logger LOG = Logger.getLogger(TrecSearchHandler.class);

  private static QueryParser QUERY_PARSER =
      new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER);

  private final IndexSearcher searcher;
  private final Map<String, String> credentials;

  public TrecSearchHandler(File indexPath, @Nullable Map<String, String> credentials)
      throws IOException {
    Preconditions.checkNotNull(indexPath);
    Preconditions.checkArgument(indexPath.exists());

    // Can be null, in which case we don't check for credentials.
    this.credentials = credentials;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
    searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
  }

  public List<TResult> search(TQuery query) throws TrecSearchException {
    Preconditions.checkNotNull(query);

    // Verify credentials.
    if (credentials != null && (!credentials.containsKey(query.group) ||
        !credentials.get(query.group).equals(query.token))) {
      LOG.info(String.format("Access denied for (%s, %s)", query.group, query.token));
      throw new TrecSearchException("Invalid credentials: access denied.");
    }

    List<TResult> results = Lists.newArrayList();
    long startTime = System.currentTimeMillis();

    try {
      Filter filter =
          NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, query.max_id, true, true);

      Query q = QUERY_PARSER.parse(query.text);
      int num = query.num_results > 10000 ? 10000 : query.num_results;
      TopDocs rs = searcher.search(q, filter, num);
      for (ScoreDoc scoreDoc : rs.scoreDocs) {
        Document hit = searcher.doc(scoreDoc.doc);

        TResult p = new TResult();
        p.id = (Long) hit.getField(StatusField.ID.name).numericValue();
        p.screen_name = hit.get(StatusField.SCREEN_NAME.name);
        p.epoch = (Long) hit.getField(StatusField.EPOCH.name).numericValue();
        p.text = hit.get(StatusField.TEXT.name);
        p.rsv = scoreDoc.score;

        p.followers_count = (Integer) hit.getField(StatusField.FOLLOWERS_COUNT.name).numericValue();
        p.statuses_count = (Integer) hit.getField(StatusField.STATUSES_COUNT.name).numericValue();

        if ( hit.get(StatusField.LANG.name) != null) {
          p.lang = hit.get(StatusField.LANG.name);
        }

        if ( hit.get(StatusField.IN_REPLY_TO_STATUS_ID.name) != null) {
          p.in_reply_to_status_id = (Long) hit.getField(StatusField.IN_REPLY_TO_STATUS_ID.name).numericValue();
        }

        if ( hit.get(StatusField.IN_REPLY_TO_USER_ID.name) != null) {
          p.in_reply_to_user_id = (Long) hit.getField(StatusField.IN_REPLY_TO_USER_ID.name).numericValue();
        }

        if ( hit.get(StatusField.RETWEETED_STATUS_ID.name) != null) {
          p.retweeted_status_id = (Long) hit.getField(StatusField.RETWEETED_STATUS_ID.name).numericValue();
        }

        if ( hit.get(StatusField.RETWEETED_USER_ID.name) != null) {
          p.retweeted_user_id = (Long) hit.getField(StatusField.RETWEETED_USER_ID.name).numericValue();
        }

        if ( hit.get(StatusField.RETWEET_COUNT.name) != null) {
          p.retweeted_count = (Integer) hit.getField(StatusField.RETWEET_COUNT.name).numericValue();
        }

        results.add(p);
      }
    } catch (Exception e) {
      e.printStackTrace();
      throw new TrecSearchException(e.getMessage());
    }

    long endTime = System.currentTimeMillis();
    LOG.info(String.format("%4dms %s", (endTime - startTime), query.toString()));

    return results;
  }
}
TOP

Related Classes of cc.twittertools.search.api.TrecSearchHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.