Package org.apache.nutch.util

Source Code of org.apache.nutch.util.MimeUtil

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.util;

// JDK imports
import java.io.File;
import java.io.IOException;
import java.io.InputStream;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Tika imports
import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;

// Slf4j logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// imported for Javadoc
import org.apache.nutch.protocol.ProtocolOutput;

/**
* @author mattmann
* @since NUTCH-608
*
* <p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
* Any mime handling code should be placed in this utility class, and hidden
* from the Nutch classes that rely on it.
* </p>
*/
public final class MimeUtil {

  private static final String SEPARATOR = ";";

  /* our Tika mime type registry */
  private MimeTypes mimeTypes;

  /* the tika detectors */
  private Tika tika;

  /* whether or not magic should be employed or not */
  private boolean mimeMagic;

  /* our log stream */
  private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());

  public MimeUtil(Configuration conf) {
    tika = new Tika();
    ObjectCache objectCache = ObjectCache.get(conf);
    MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
        .getName());
    if (mimeTypez == null) {
      try {
          String customMimeTypeFile = conf.get("mime.types.file");
          if (customMimeTypeFile!=null && customMimeTypeFile.equals("")==false){
              try {
              mimeTypez = MimeTypesFactory.create(conf
                      .getConfResourceAsInputStream(customMimeTypeFile));
              }
              catch (Exception e){
                  LOG.error("Can't load mime.types.file : "+customMimeTypeFile+" using Tika's default");
              }
          }
          if (mimeTypez==null)
              mimeTypez = MimeTypes.getDefaultMimeTypes();
      } catch (Exception e) {
        LOG.error("Exception in MimeUtil "+e.getMessage());
        throw new RuntimeException(e);
      }
      objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
    }
   
    this.mimeTypes = mimeTypez;
    this.mimeMagic = conf.getBoolean("mime.type.magic", true);
  }

  /**
   * Cleans a {@link MimeType} name by removing out the actual {@link MimeType},
   * from a string of the form:
   *
   * <pre>
   *      &lt;primary type&gt;/&lt;sub type&gt; ; &lt; optional params
   * </pre>
   *
   * @param origType
   *          The original mime type string to be cleaned.
   * @return The primary type, and subtype, concatenated, e.g., the actual mime
   *         type.
   */
  public static String cleanMimeType(String origType) {
    if (origType == null)
      return null;

    // take the origType and split it on ';'
    String[] tokenizedMimeType = origType.split(SEPARATOR);
    if (tokenizedMimeType.length > 1) {
      // there was a ';' in there, take the first value
      return tokenizedMimeType[0];
    } else {
      // there wasn't a ';', so just return the orig type
      return origType;
    }
  }

  /**
   * A facade interface to trying all the possible mime type resolution
   * strategies available within Tika. First, the mime type provided in
   * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
   * Then the cleaned mime type is looked up in the underlying Tika
   * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
   * is found, then that mime type is used, otherwise URL resolution is
   * used to try and determine the mime type. However, if
   * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
   * then mime type magic resolution is used to try and obtain a
   * better-than-the-default approximation of the {@link MimeType}.
   *
   * @param typeName
   *          The original mime type, returned from a {@link ProtocolOutput}.
   * @param url
   *          The given @see url, that Nutch was trying to crawl.
   * @param data
   *          The byte data, returned from the crawl, if any.
   * @return The correctly, automatically guessed {@link MimeType} name.
   */
  public String autoResolveContentType(String typeName, String url, byte[] data) {
    String retType = null;
    MimeType type = null;
    String cleanedMimeType = null;

    cleanedMimeType = MimeUtil.cleanMimeType(typeName);
    // first try to get the type from the cleaned type name
    if (cleanedMimeType != null) {
      try {
        type = mimeTypes.forName(cleanedMimeType);
        cleanedMimeType = type.getName();
      } catch (MimeTypeException mte) {
        // Seems to be a malformed mime type name...
        cleanedMimeType = null;
      }
    }

    // if returned null, or if it's the default type then try url resolution
    if (type == null
        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
      // If no mime-type header, or cannot find a corresponding registered
      // mime-type, then guess a mime-type from the url pattern
      try {
        retType = tika.detect(url) != null ? tika.detect(url) : null;
      } catch (Exception e) {
        String message = "Problem loading default Tika configuration";
        LOG.error(message, e);
        throw new RuntimeException(e);
      }
    } else {
        retType = type.getName();
    }

    // if magic is enabled use mime magic to guess if the mime type returned
    // from the magic guess is different than the one that's already set so far
    // if it is, and it's not the default mime type, then go with the mime type
    // returned by the magic
    if (this.mimeMagic) {
      String magicType = null;
      // pass URL (file name) and (cleansed) content type from protocol to Tika
      Metadata tikaMeta = new Metadata();
      tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
      tikaMeta.add(Metadata.CONTENT_TYPE,
          (cleanedMimeType != null ? cleanedMimeType : typeName));
      try {
        InputStream stream = TikaInputStream.get(data);
        try {
          magicType = tika.detect(stream, tikaMeta);
       } finally {
         stream.close();
        }
      } catch (IOException ignore) {}

      if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
          && !magicType.equals(MimeTypes.PLAIN_TEXT)
          && retType != null && !retType.equals(magicType)) {

        // If magic enabled and the current mime type differs from that of the
        // one returned from the magic, take the magic mimeType
        retType = magicType;
      }

      // if type is STILL null after all the resolution strategies, go for the
      // default type
      if (retType == null) {
        try {
          retType = MimeTypes.OCTET_STREAM;
        } catch (Exception ignore) {
        }
      }
    }

    return retType;
  }

  /**
   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
   * method.
   *
   * @param url
   *          A string representation of the document {@link URL} to sense the
   *          {@link MimeType} for.
   * @return An appropriate {@link MimeType}, identified from the given
   *         Document url in string form.
   */
  public String getMimeType(String url) {
    return tika.detect(url);
  }

  /**
   * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
   * method.
   *
   * @param name
   *          The name of a valid {@link MimeType} in the Tika mime registry.
   * @return The object representation of the {@link MimeType}, if it exists,
   *         or null otherwise.
   */
  public String forName(String name) {
    try {
      return this.mimeTypes.forName(name).toString();
    } catch (MimeTypeException e) {
      LOG.error("Exception getting mime type by name: [" + name
          + "]: Message: " + e.getMessage());
      return null;
    }
  }

  /**
   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
   * method.
   *
   * @param f
   *          The {@link File} to sense the {@link MimeType} for.
   * @return The {@link MimeType} of the given {@link File}, or null if it
   *         cannot be determined.
   */
  public String getMimeType(File f) {
    try {
      return tika.detect(f);
    } catch (Exception e) {
      LOG.error("Exception getting mime type for file: [" + f.getPath()
          + "]: Message: " + e.getMessage());
      return null;
    }
  }


}
TOP

Related Classes of org.apache.nutch.util.MimeUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.