Package com.gentics.cr.lucene.indexer.transformer.tika

Source Code of com.gentics.cr.lucene.indexer.transformer.tika.TikaParserTransformer

package com.gentics.cr.lucene.indexer.transformer.tika;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import com.gentics.cr.CRResolvableBean;
import com.gentics.cr.configuration.GenericConfiguration;
import com.gentics.cr.exceptions.CRException;
import com.gentics.cr.lucene.indexer.transformer.ContentTransformer;

/**
* Transformer to automatically parse various filetypes using autodetect of the mimetype.
* Uses the Tika framework internally for detection and parsing.
* Tika uses Apache POI and pdfbox.
*
* @author Friedreich Bernhard
*/
public class TikaParserTransformer extends ContentTransformer {

  /**
   * Config key to set the attribute to use for parsing.
   */
  private static final String TRANSFORMER_CONTENT_ATTRIBUTE_FIELD_KEY = "contentAttribute";
  private static final String TRANSFORMER_TARGET_ATTRIBUTE_FIELD_KEY = "targetAttribute";

  private static final String TRANSFORMER_CREATETIMESTAMP_ATTRIBUTE_FIELD_KEY = "createTimestampField";
  private static final String TRANSFORMER_EDITTIMESTAMP_ATTRIBUTE_FIELD_KEY = "editTimestampField";
  private static final String TRANSFORMER_PUBLISHTIMESTAMP_ATTRIBUTE_FIELD_KEY = "publishTimestampField";
  private static final String TRANSFORMER_HEADING_ATTRIBUTE_FIELD_KEY = "headingField";
  private static final String TRANSFORMER_KEYWORDS_ATTRIBUTE_FIELD_KEY = "keywordsField";
  private static final String TRANSFORMER_MIMETYPE_ATTRIBUTE_FIELD_KEY = "mimetypeField";

  private static final String TRANSFORMER_ALLOWED_LANGS_FIELD_KEY = "allowedLanguages";

  private static final String TRANSFORMER_DETECT_LANGUAGES_FIELD_KEY = "detectLanguages";

  /**
   * Field to store the config value representing the attribute to use for parsing.
   */
  private String contentAttributeField = "content";
  private String targetAttributeField = "binarycontent";
  private String createTimestampField = "createtimestamp";
  private String publishTimestampField = "publishtimestamp";
  private String editTimestampField = "edittimestamp";
  private String headingField = "heading";
  private String keywordsField = "keywords";
  private String mimetypeField = "mimetype";

  private List<String> allowedLanguages = null;

  private boolean languageDetection = false;

  /**
   * Disable file length limit.
   */
  private int fileLengthLimit = -1;

  /**
   * Parser provided by Tika to automatically detect the mimetype and afterwards do the parsing.
   */
  private final AutoDetectParser parser = new AutoDetectParser();

  /**
   * Tika parser framework.
   */
  private final Tika tika = new Tika();

  /**
   * Get new instance of TikaParserTransformer.
   * @param config Configuration for the transformer.
   */
  public TikaParserTransformer(final GenericConfiguration config) {
    super(config);
    if (config.get(TRANSFORMER_CONTENT_ATTRIBUTE_FIELD_KEY) != null) {
      contentAttributeField = config.getString(TRANSFORMER_CONTENT_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_TARGET_ATTRIBUTE_FIELD_KEY) != null) {
      targetAttributeField = config.getString(TRANSFORMER_TARGET_ATTRIBUTE_FIELD_KEY);
    }

    if (config.get(TRANSFORMER_CREATETIMESTAMP_ATTRIBUTE_FIELD_KEY) != null) {
      createTimestampField = config.getString(TRANSFORMER_CREATETIMESTAMP_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_EDITTIMESTAMP_ATTRIBUTE_FIELD_KEY) != null) {
      editTimestampField = config.getString(TRANSFORMER_EDITTIMESTAMP_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_PUBLISHTIMESTAMP_ATTRIBUTE_FIELD_KEY) != null) {
      publishTimestampField = config.getString(TRANSFORMER_PUBLISHTIMESTAMP_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_HEADING_ATTRIBUTE_FIELD_KEY) != null) {
      headingField = config.getString(TRANSFORMER_HEADING_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_KEYWORDS_ATTRIBUTE_FIELD_KEY) != null) {
      keywordsField = config.getString(TRANSFORMER_KEYWORDS_ATTRIBUTE_FIELD_KEY);
    }
    if (config.get(TRANSFORMER_MIMETYPE_ATTRIBUTE_FIELD_KEY) != null) {
      mimetypeField = config.getString(TRANSFORMER_MIMETYPE_ATTRIBUTE_FIELD_KEY);
    }

    if (config.get(TRANSFORMER_ALLOWED_LANGS_FIELD_KEY) != null) {
      allowedLanguages = Arrays.asList(config.getString(TRANSFORMER_ALLOWED_LANGS_FIELD_KEY).split(","));
    }
    if (config.get(TRANSFORMER_DETECT_LANGUAGES_FIELD_KEY) != null) {
      languageDetection = config.getBoolean(TRANSFORMER_DETECT_LANGUAGES_FIELD_KEY);
    }
  }

  @Override
  public void processBean(final CRResolvableBean bean) throws CRException {
    if (this.contentAttributeField != null) {
      Object obj = bean.get(this.contentAttributeField);
      if (obj != null) {
        TikaInputStream inputStream = null;
        if (obj instanceof byte[]) {
          inputStream = TikaInputStream.get((byte[]) obj);
        } else {
          throw new IllegalArgumentException("Parameter must be instance of byte[]");
        }

        ContentHandler textHandler = new BodyContentHandler(fileLengthLimit);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try {
          metadata.set(Metadata.CONTENT_TYPE, tika.detect(inputStream));

          parser.parse(inputStream, textHandler, metadata, context);

          bean.set(headingField, metadata.get(TikaCoreProperties.TITLE));

          if (bean.get(createTimestampField) == null) {
            bean.set(createTimestampField, metadata.get(TikaCoreProperties.CREATED));
          }
          if (bean.get(editTimestampField) == null) {
            bean.set(editTimestampField, metadata.get(TikaCoreProperties.MODIFIED));
          }
          if (bean.get(keywordsField) == null) {
            bean.set(keywordsField, metadata.get(TikaCoreProperties.KEYWORDS));
          }
          if (bean.get(publishTimestampField) == null) {
            bean.set(publishTimestampField, metadata.get(TikaCoreProperties.PRINT_DATE));
          }
          if (bean.get(mimetypeField) == null) {
            //HttpHeaders.CONTENT_TYPE
            bean.set(mimetypeField, metadata.get(Metadata.CONTENT_TYPE));
          }

          String content = prepareContent(bean, textHandler);
          bean.set(this.targetAttributeField, content);

        } catch (IOException e) {
          LOGGER.error("Error reading inputstream from bean: " + bean.getContentid(), e);
        } catch (SAXException e) {
          LOGGER.error("Sax Parser Exception while reading inputstream from bean: " + bean.getContentid(), e);
        } catch (TikaException e) {
          LOGGER.error("Tika Parser Exception while reading inputstream  from bean: " + bean.getContentid(), e);
        } catch (Exception e) {
          LOGGER.error("Exception occured while indexing file at bean: " + bean.getContentid(), e);
        } finally {
          try {
            if (inputStream != null) {
              inputStream.close();
            }
          } catch (IOException e) {
            LOGGER.error("Could not close inputstream of bean: " + bean.getContentid(), e);
          }
        }
      }
    } else {
      LOGGER.error("Configured attribute is null. Bean will not be processed");
    }
  }

  private String prepareContent(final CRResolvableBean bean, ContentHandler textHandler) {
    String content = textHandler.toString();

    if (languageDetection) {
      String languageCode = bean.getString("languagecode");
      if (languageCode == null || languageCode.equals("")) {
        LanguageIdentifier identifier = new LanguageIdentifier(content);
        String lang = identifier.getLanguage();
        if (identifier.isReasonablyCertain() && (allowedLanguages == null || allowedLanguages.contains(lang))) {
          bean.set("languagecode", lang);
        }
      }
    }

    return content;
  }

  @Override
  public void destroy() {

  }
}
TOP

Related Classes of com.gentics.cr.lucene.indexer.transformer.tika.TikaParserTransformer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.