/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
// JDK imports
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
// Tika imports
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
// Slf4j logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// imported for Javadoc
import org.apache.nutch.protocol.ProtocolOutput;
/**
* @author mattmann
* @since NUTCH-608
*
* <p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
* Any mime handling code should be placed in this utility class, and hidden
* from the Nutch classes that rely on it.
* </p>
*/
public final class MimeUtil {
private static final String SEPARATOR = ";";
/* our Tika mime type registry */
private MimeTypes mimeTypes;
/* the tika detectors */
private Tika tika;
/* whether or not magic should be employed or not */
private boolean mimeMagic;
/* our log stream */
private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
public MimeUtil(Configuration conf) {
tika = new Tika();
ObjectCache objectCache = ObjectCache.get(conf);
MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
.getName());
if (mimeTypez == null) {
try {
String customMimeTypeFile = conf.get("mime.types.file");
if (customMimeTypeFile!=null && customMimeTypeFile.equals("")==false){
try {
mimeTypez = MimeTypesFactory.create(conf
.getConfResourceAsInputStream(customMimeTypeFile));
}
catch (Exception e){
LOG.error("Can't load mime.types.file : "+customMimeTypeFile+" using Tika's default");
}
}
if (mimeTypez==null)
mimeTypez = MimeTypes.getDefaultMimeTypes();
} catch (Exception e) {
LOG.error("Exception in MimeUtil "+e.getMessage());
throw new RuntimeException(e);
}
objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
}
this.mimeTypes = mimeTypez;
this.mimeMagic = conf.getBoolean("mime.type.magic", true);
}
/**
* Cleans a {@link MimeType} name by removing out the actual {@link MimeType},
* from a string of the form:
*
* <pre>
* <primary type>/<sub type> ; < optional params
* </pre>
*
* @param origType
* The original mime type string to be cleaned.
* @return The primary type, and subtype, concatenated, e.g., the actual mime
* type.
*/
public static String cleanMimeType(String origType) {
if (origType == null)
return null;
// take the origType and split it on ';'
String[] tokenizedMimeType = origType.split(SEPARATOR);
if (tokenizedMimeType.length > 1) {
// there was a ';' in there, take the first value
return tokenizedMimeType[0];
} else {
// there wasn't a ';', so just return the orig type
return origType;
}
}
/**
* A facade interface to trying all the possible mime type resolution
* strategies available within Tika. First, the mime type provided in
* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
* {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
* found, then that mime type is used, otherwise URL resolution is
* used to try and determine the mime type. If that means is unsuccessful, and
* if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
* better-than-the-default approximation of the {@link MimeType}.
*
* @param typeName
* The original mime type, returned from a {@link ProtocolOutput}.
* @param url
* The given @see url, that Nutch was trying to crawl.
* @param data
* The byte data, returned from the crawl, if any.
* @return The correctly, automatically guessed {@link MimeType} name.
*/
public String autoResolveContentType(String typeName, String url, byte[] data) {
String retType = null;
String magicType = null;
MimeType type = null;
String cleanedMimeType = null;
try {
cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
.forName(MimeUtil.cleanMimeType(typeName)).getName()
: null;
} catch (MimeTypeException mte) {
// Seems to be a malformed mime type name...
}
// first try to get the type from the cleaned type name
try {
type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
: null;
} catch (MimeTypeException e) {
type = null;
}
// if returned null, or if it's the default type then try url resolution
if (type == null
|| (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
// If no mime-type header, or cannot find a corresponding registered
// mime-type, then guess a mime-type from the url pattern
try {
TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
Tika tika = new Tika(tikaConfig);
retType = tika.detect(url) != null ? tika.detect(url) : null;
} catch (Exception e) {
String message = "Problem loading default Tika configuration";
LOG.error(message, e);
throw new RuntimeException(e);
}
} else {
retType = type.getName();
}
// if magic is enabled use mime magic to guess if the mime type returned
// from the magic guess is different than the one that's already set so far
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
magicType = tika.detect(data);
// Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230
//MimeType magicType = this.mimeTypes.getMimeType(data);
if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
&& !magicType.equals(MimeTypes.PLAIN_TEXT)
&& retType != null && !retType.equals(magicType)) {
// If magic enabled and the current mime type differs from that of the
// one returned from the magic, take the magic mimeType
retType = magicType;
}
// if type is STILL null after all the resolution strategies, go for the
// default type
if (retType == null) {
try {
retType = MimeTypes.OCTET_STREAM;
} catch (Exception ignore) {
}
}
}
return retType;
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
* method.
*
* @param url
* A string representation of the document {@link URL} to sense the
* {@link MimeType} for.
* @return An appropriate {@link MimeType}, identified from the given
* Document url in string form.
*/
public String getMimeType(String url) {
return tika.detect(url);
}
/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* method.
*
* @param name
* The name of a valid {@link MimeType} in the Tika mime registry.
* @return The object representation of the {@link MimeType}, if it exists,
* or null otherwise.
*/
public String forName(String name) {
try {
return this.mimeTypes.forName(name).toString();
} catch (MimeTypeException e) {
LOG.error("Exception getting mime type by name: [" + name
+ "]: Message: " + e.getMessage());
return null;
}
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* method.
*
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
* cannot be determined.
*/
public String getMimeType(File f) {
try {
return tika.detect(f);
} catch (Exception e) {
LOG.error("Exception getting mime type for file: [" + f.getPath()
+ "]: Message: " + e.getMessage());
return null;
}
}
}