Package com.dotcms.tika

Source Code of com.dotcms.tika.TikaUtils

package com.dotcms.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPOutputStream;

import com.dotcms.repackage.org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import com.dotcms.repackage.org.apache.commons.io.IOUtils;
import com.dotcms.repackage.org.apache.commons.io.input.ReaderInputStream;
import com.dotcms.repackage.org.apache.tika.Tika;
import com.dotcms.repackage.org.apache.tika.io.TikaInputStream;
import com.dotcms.repackage.org.apache.tika.metadata.Metadata;
import com.dotmarketing.business.APILocator;
import com.dotmarketing.portlets.fileassets.business.FileAssetAPI;
import com.dotmarketing.util.Config;
import com.dotmarketing.util.Logger;
import com.dotmarketing.util.StringUtils;
import com.dotmarketing.util.UtilMethods;

public class TikaUtils {


  /**
   * Right now the method use the Tika facade directly for parse the document without any kind of restriction about the parser because the
   * new Tika().parse method use the AutoDetectParser by default.
   *
   *
   * @author Graziano Aliberti - Engineering Ingegneria Informatica S.p.a
   *
   * May 31, 2013 - 12:27:19 PM
   */
  public Map<String, String> getMetaDataMap(String inode, File binFile, String mimeType, boolean forceMemory) {
    Map<String, String> metaMap = new HashMap<String, String>();

    // store content metadata on disk
        File contentM=APILocator.getFileAssetAPI().getContentMetadataFile(inode);

    Tika t = new Tika();
    Metadata met = new Metadata();
    t.setMaxStringLength(-1);
    Reader fulltext = null;
    InputStream is = null;
    // if the limit is not "unlimited"
    // I can use the faster parseToString
    try {

      if(forceMemory){
        // no worry about the limit and less time to process.
        String content = t.parseToString(new FileInputStream(binFile), met);
        metaMap = new HashMap<String, String>();
        for (int i = 0; i < met.names().length; i++) {
          String name = met.names()[i];
          if (UtilMethods.isSet(name) && met.get(name) != null) {
            // we will want to normalize our metadata for searching
            String[] x = translateKey(name);
            for (String y : x)
              metaMap.put(y, met.get(name));
          }
        }
        metaMap.put(FileAssetAPI.CONTENT_FIELD, content);
      }
      else {


        is = TikaInputStream.get(binFile);
        fulltext = t.parse(is, met);
        metaMap = new HashMap<String, String>();
        for (int i = 0; i < met.names().length; i++) {
          String name = met.names()[i];
          if (UtilMethods.isSet(name) && met.get(name) != null) {
            // we will want to normalize our metadata for searching
            String[] x = translateKey(name);
            for (String y : x)
              metaMap.put(y, met.get(name));
          }
        }

        if(!contentM.exists() && contentM.getParentFile().mkdirs() && contentM.createNewFile()) {
          OutputStream out=new FileOutputStream(contentM);

          // compressor config
          String compressor=Config.getStringProperty("CONTENT_METADATA_COMPRESSOR", "none");
          if(compressor.equals("gzip")) {
            out = new GZIPOutputStream(out);
          }
          else if(compressor.equals("bzip2")) {
            out = new BZip2CompressorOutputStream(out);
          }

          ReaderInputStream ris = null;

          try {
            int count;
            ris = new ReaderInputStream(fulltext, StandardCharsets.UTF_8);

            int metadataLimit = Config.getIntProperty("META_DATA_MAX_SIZE", 5) * 1024 * 1024;
            int numOfChunks = metadataLimit / 1024;

            char[] buf = new char[1024];
            byte[] bytes = new byte[1024];

            while ((count = fulltext.read(buf)) > 0 && numOfChunks>0) {
              String lowered = new String(buf);
              lowered = lowered.toLowerCase();
              bytes = lowered.getBytes(StandardCharsets.UTF_8);
              out.write(bytes, 0, count);
              numOfChunks --;
            }
          }catch(IOException ioExc){
            Logger.debug( this.getClass(), "Error Reading TikaParse Stream.", ioExc );
          }finally {
            if ( out != null ) {
              try {
                out.close();
              } catch ( IOException e ) {
                Logger.warn( this.getClass(), "Error Closing Stream.", e );
              }
            }

            if ( ris != null ) {
              try {
                ris.close();
              } catch ( IOException e ) {
                Logger.warn( this.getClass(), "Error Closing Stream.", e );
              }
            }

            IOUtils.closeQuietly(out);
            IOUtils.closeQuietly(fulltext);
          }
        }
      }
    } catch (Exception e) {
      Logger.error(this.getClass(), "Could not parse file metadata for file : " + binFile.getAbsolutePath() + ". " +e.getMessage());
    }
    finally {
      if(null!=fulltext)
        IOUtils.closeQuietly(fulltext);
      if(null!=is)
        IOUtils.closeQuietly(is);
      try{
        metaMap.put(FileAssetAPI.SIZE_FIELD, String.valueOf(binFile.length()));
      }
      catch(Exception ex){
        Logger.error(this.getClass(), "Could not parse file metadata for file : " + binFile.getAbsolutePath() + ". " +ex.getMessage());
      }
    }
    return metaMap;
  }

  /**
   * This method takes a file and uses tika to parse the metadata from it. It
   * returns a Map of the metadata
   *
   * @param binFile
   * @return
   */
  public Map<String, String> getMetaDataMap(String inode,File binFile, boolean forceMemory) {
    return getMetaDataMap(inode,binFile, null, forceMemory);
  }

//  /**
//   *
//   * @param binFile
//   * @return
//   */
//  private Parser getParser(File binFile) {
//    String mimeType = new MimetypesFileTypeMap().getContentType(binFile);
//    return getParser(binFile, mimeType);
//  }


//  private Parser getParser(File binFile, String mimeType) {
//    String[] mimeTypes = Config.getStringArrayProperty("CONTENT_PARSERS_MIMETYPES");
//    String[] parsers = Config.getStringArrayProperty("CONTENT_PARSERS");
//    int index = Arrays.binarySearch(mimeTypes, mimeType);
//    if (index > -1 && parsers.length > 0) {
//      String parserClassName = parsers[index];
//      Class<Parser> parserClass;
//      try {
//        parserClass = (Class<Parser>) Class.forName(parserClassName);
//        return parserClass.newInstance();
//      } catch (Exception e) {
//        Logger.warn(this.getClass(), "A content parser for mime type " + mimeType
//            + " was found but could not be instantiated, using default content parser.");
//      }
//    }
//    return new AutoDetectParser();
//  }




  /**
   * normalize metadata from various filetypes this method will return an
   * array of metadata keys that we can use to normalize the values in our
   * fileAsset metadata For example, tiff:ImageLength = "height" for image
   * files, so we return {"tiff:ImageLength", "height"} and both metadata are
   * written to our metadata field
   *
   * @param key
   * @return
   */
  private String[] translateKey(String key) {
    String[] x = getTranslationMap().get(key);
    if (x == null) {
      x = new String[] { StringUtils.sanitizeCamelCase(key) };
    }
    return x;
  }

  private Map<String, String[]> translateMeta = null;

  private Map<String, String[]> getTranslationMap() {
    if (translateMeta == null) {
      synchronized ("translateMeta".intern()) {
        if (translateMeta == null) {
          translateMeta = new HashMap<String, String[]>();
          translateMeta.put("tiff:ImageWidth", new String[] { "tiff:ImageWidth", "width" });
          translateMeta.put("tiff:ImageLength", new String[] { "tiff:ImageLength", "height" });
        }
      }
    }
    return translateMeta;
  }
}
TOP

Related Classes of com.dotcms.tika.TikaUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.