Package org.apache.solr.handler.extraction

Source Code of org.apache.solr.handler.extraction.ExtractingRequestHandler

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeTypeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;



/**
* Handler for rich documents like PDF or Word or any other file format that Tika handles that need the text to be extracted
* first from the document.
* <p/>
*/
public class ExtractingRequestHandler extends ContentStreamHandlerBase implements SolrCoreAware {

  private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);

  public static final String CONFIG_LOCATION = "tika.config";

  public static final String DATE_FORMATS = "date.formats";

  protected TikaConfig config;

  protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;

  protected SolrContentHandlerFactory factory;

  @Override
  public void init(NamedList args) {
    super.init(args);
  }

  public void inform(SolrCore core) {
    if (initArgs != null) {
      //if relative,then relative to config dir, otherwise, absolute path
      String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
      if (tikaConfigLoc != null) {
        File configFile = new File(tikaConfigLoc);
        if (configFile.isAbsolute() == false) {
          configFile = new File(core.getResourceLoader().getConfigDir(), configFile.getPath());
        }
        try {
          config = new TikaConfig(configFile);
        } catch (Exception e) {
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
      }
      NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
      if (configDateFormats != null && configDateFormats.size() > 0) {
        dateFormats = new HashSet<String>();
        Iterator<Map.Entry> it = configDateFormats.iterator();
        while (it.hasNext()) {
          String format = (String) it.next().getValue();
          log.info("Adding Date Format: " + format);
          dateFormats.add(format);
        }
      }
    }
    if (config == null) {
      try {
        config = getDefaultConfig(core.getResourceLoader().getClassLoader());
      } catch (MimeTypeException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR, e);
      }
    }
    factory = createFactory();
  }

  protected SolrContentHandlerFactory createFactory() {
    return new SolrContentHandlerFactory(dateFormats);
  }

  @Override
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    return new ExtractingDocumentLoader(req, processor, config, factory);
  }

  // ////////////////////// SolrInfoMBeans methods //////////////////////
  @Override
  public String getDescription() {
    return "Add/Update Rich document";
  }

  @Override
  public String getVersion() {
    return "$Revision:$";
  }

  @Override
  public String getSourceId() {
    return "$Id:$";
  }

  @Override
  public String getSource() {
    return "$URL:$";
  }
 
  private TikaConfig getDefaultConfig(ClassLoader classLoader) throws MimeTypeException, IOException {
    return new TikaConfig(classLoader);
  }

}
TOP

Related Classes of org.apache.solr.handler.extraction.ExtractingRequestHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.