package eu.planets_project.ifr.core.services.characterisation.metadata.impl;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import javax.ejb.Stateless;
import javax.jws.WebService;
import javax.xml.ws.soap.MTOM;
import nz.govt.natlib.meta.FileHarvestSource;
import nz.govt.natlib.meta.config.Config;
import nz.govt.natlib.meta.config.Configuration;
import nz.govt.natlib.meta.config.ConfigurationException;
import nz.govt.natlib.meta.ui.PropsManager;
import org.apache.commons.io.IOUtils;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import com.sun.xml.ws.developer.StreamingAttachment;
import eu.planets_project.ifr.core.techreg.formats.FormatRegistry;
import eu.planets_project.ifr.core.techreg.formats.FormatRegistryFactory;
import eu.planets_project.services.PlanetsServices;
import eu.planets_project.services.characterise.Characterise;
import eu.planets_project.services.characterise.CharacteriseResult;
import eu.planets_project.services.datatypes.DigitalObject;
import eu.planets_project.services.datatypes.Parameter;
import eu.planets_project.services.datatypes.Property;
import eu.planets_project.services.datatypes.ServiceDescription;
import eu.planets_project.services.datatypes.ServiceDescription.Builder;
import eu.planets_project.services.datatypes.ServiceReport;
import eu.planets_project.services.datatypes.ServiceReport.Status;
import eu.planets_project.services.datatypes.ServiceReport.Type;
import eu.planets_project.services.datatypes.Tool;
import eu.planets_project.services.utils.DigitalObjectUtils;
import eu.planets_project.services.utils.ServiceUtils;
/**
* Service wrapping the Metadata Extraction Tool from the National Library of New Zealand
* (http://meta-extractor.sourceforge.net/).
* @author Fabian Steeg (fabian.steeg@uni-koeln.de)
*/
@Stateless
@MTOM
@StreamingAttachment( parseEagerly=true, memoryThreshold=ServiceUtils.JAXWS_SIZE_THRESHOLD )
@WebService(name = MetadataExtractor.NAME, serviceName = Characterise.NAME, targetNamespace =
PlanetsServices.NS, endpointInterface = "eu.planets_project.services.characterise.Characterise")
public final class MetadataExtractor implements Characterise {
/** Name of this service; used as the {@code name} of the {@code @WebService} annotation on this class. */
static final String NAME = "MetadataExtractor";
/** Common prefix of every property URI created by {@link #makePropertyURI(String)}. */
static final String NZME_PROPERTY_ROOT = "planets:pc/nzme/";
/**
 * Builds the URI of a property by appending the property name to
 * {@link #NZME_PROPERTY_ROOT}.
 * @param name The property name
 * @return A property URI for the given name
 */
static URI makePropertyURI(final String name) {
    final String location = NZME_PROPERTY_ROOT + name;
    return URI.create(location);
}
/**
 * Characterises the given digital object with the metadata extraction tool and returns the
 * extracted metadata as a list of properties. The parameters are ignored in this
 * implementation (you may pass null). If the extraction produced no result, an empty property
 * list is returned together with an error report. {@inheritDoc}
 * @see eu.planets_project.services.characterise.Characterise#characterise(eu.planets_project.services.datatypes.DigitalObject,
 *      java.util.List)
 */
public CharacteriseResult characterise(final DigitalObject digitalObject,
        final List<Parameter> parameters) {
    String resultString = basicCharacteriseOneBinary(digitalObject);
    if (resultString == null) {
        /*
         * The helper returns null when harvesting or reading the result file failed; report
         * that instead of passing null on to the XML parsing (which would fail with a
         * NullPointerException) or claiming success.
         */
        return new CharacteriseResult(new ArrayList<Property>(), new ServiceReport(Type.ERROR,
                Status.TOOL_ERROR, "Metadata extraction failed, no result was produced"));
    }
    List<Property> props = readProperties(resultString);
    return new CharacteriseResult(props, new ServiceReport(Type.INFO, Status.SUCCESS, "OK"));
}
/**
 * Lists the properties that can be extracted for the given format. The file extensions
 * registered for the format are matched against the extension of each metadata type's sample
 * file; for every matching type, the properties declared by its adapter are added to the
 * result. The values of the returned properties are null. {@inheritDoc}
 * @param formatURI The format to list the extractable properties for
 * @return The extractable properties, or an empty list if the format is null or no metadata
 *         type matches it
 * @see eu.planets_project.services.characterise.Characterise#listProperties(java.net.URI)
 */
public List<Property> listProperties(final URI formatURI) {
    List<Property> result = new ArrayList<Property>();
    if (formatURI == null) {
        return result;
    }
    /* Get the extensions for the supplied Pronom ID: */
    FormatRegistry registry = FormatRegistryFactory.getFormatRegistry();
    Set<String> extensions = registry.getExtensions(formatURI);
    if (extensions == null || extensions.isEmpty()) {
        return result;
    }
    /* Find the corresponding metadata file type: */
    for (MetadataType metadataType : MetadataType.values()) {
        String[] split = metadataType.sample.split("\\.");
        String suffix = split[split.length - 1];
        /* Fixed locale, so the result does not depend on the default locale of the server: */
        if (extensions.contains(suffix.toLowerCase(Locale.ENGLISH))) {
            /* For that, get the extractable properties: */
            for (String propertyName : listProperties(metadataType)) {
                result.add(new Property(makePropertyURI(propertyName), propertyName, null));
            }
        }
    }
    return result;
}
/**
 * Assembles the description of this service, including the input formats it supports.
 * {@inheritDoc}
 * @see eu.planets_project.services.PlanetsService#describe()
 */
public ServiceDescription describe() {
    /* Collect the supported input formats from the tech reg, one lookup per metadata type: */
    final FormatRegistry techReg = FormatRegistryFactory.getFormatRegistry();
    final List<URI> supported = new ArrayList<URI>();
    for (MetadataType type : MetadataType.values()) {
        /*
         * The lookup key is the extension of the sample file, not the mime type: the mime
         * type is file/unknown for many types (it's what the tool returns as a result, used
         * for testing)
         */
        final String[] nameParts = type.sample.split("\\.");
        supported.addAll(techReg.getUrisForExtension(nameParts[nameParts.length - 1]));
    }
    final URI trackerItem = URI
            .create("http://sourceforge.net/tracker/index.php?func=detail&aid=2027729"
                    + "&group_id=189407&atid=929202");
    final Builder description = new ServiceDescription.Builder(
            "New Zealand Metadata Extractor Service", Characterise.class.getName());
    description.author("Fabian Steeg");
    description.classname(this.getClass().getName());
    description.description("Metadata extraction service based on the Metadata Extraction Tool "
            + "of the National Library of New Zealand (patched 3.4GA).");
    description.serviceProvider("The Planets Consortium");
    description.tool(Tool.create(null, "New Zealand Metadata Extractor", "3.4GA (patched)",
            null, "http://meta-extractor.sourceforge.net/"));
    description.furtherInfo(trackerItem);
    description.inputFormats(supported.toArray(new URI[supported.size()]));
    return description.build();
}
/*------------------------------------------------------------------------*/
/*-------------------------- package private API -------------------------*/
/*------------------------------------------------------------------------*/
/**
 * Converts the XML output of the NZ metadata extractor into properties: every child element
 * of the METADATA element below the document root becomes one property, with the element
 * name as the property name and the element text as the property value.
 * @param metadataXml The XML string resulting from harvesting, the output of the NZ metadata
 *        extractor
 * @return A list of properties; empty if the input is null or blank, cannot be parsed, or
 *         contains no METADATA element
 */
static List<Property> readProperties(final String metadataXml) {
    List<Property> properties = new ArrayList<Property>();
    if (metadataXml == null || metadataXml.trim().length() == 0) {
        /* Nothing to parse (a StringReader cannot be created for null): */
        return properties;
    }
    SAXBuilder builder = new SAXBuilder();
    try {
        Document doc = builder.build(new StringReader(metadataXml));
        Element meta = doc.getRootElement().getChild("METADATA");
        if (meta == null) {
            /* Well-formed XML, but not in the expected layout: */
            return properties;
        }
        for (Object propElem : meta.getChildren()) {
            Element e = (Element) propElem;
            Property p = new Property(makePropertyURI(e.getName()), e.getName(), e.getText());
            properties.add(p);
        }
    } catch (JDOMException e) {
        /* Best effort: report the problem and return what has been read so far */
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return properties;
}
/**
 * Lists the names of the elements declared in the DTD that is bundled in the adapter jar of
 * the given type. The jar is copied to a temporary file for reading; the temporary file and
 * all streams are released before this method returns.
 * @param type The file type
 * @return A list of attributes extractable for the given type, as defined in the adapters DTD
 *         file; empty if the jar contains no DTD or could not be read
 * @throws IllegalStateException if the adapter jar of the type cannot be loaded
 */
static List<String> listProperties(final MetadataType type) {
    List<String> props = new ArrayList<String>();
    /*
     * We get the adapter jar from the current thread in order to work in all environments
     * (e.g., when running locally as a test or when running on a server:)
     */
    InputStream stream = Thread.currentThread().getContextClassLoader()
            .getResourceAsStream(type.adapter);
    if (stream == null) {
        throw new IllegalStateException("Could not load adapter Jar: " + type.adapter);
    }
    File adapter = null;
    JarFile jar = null;
    try {
        /* Stream the jar to a file, as a JarFile can only be opened from a file: */
        adapter = File.createTempFile("adapter", null);
        FileOutputStream out = new FileOutputStream(adapter);
        try {
            IOUtils.copyLarge(stream, out);
        } finally {
            out.close();
        }
        /*
         * The NZ metadata extractor has an adapter jar for each supported file format. Inside
         * that, there is a dtd in which the extractable properties for that format are listed.
         * Thus, we iterate over the contents of the jar file, get the dtd, and read the
         * properties defined inside of it:
         */
        jar = new JarFile(adapter);
        Enumeration<JarEntry> entries = jar.entries();
        while (entries.hasMoreElements()) {
            JarEntry entry = entries.nextElement();
            if (entry.getName().endsWith("dtd")) {
                addDeclaredElements(jar.getInputStream(entry), props);
                /* It's just one DTD file: */
                break;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeQuietly(stream);
        if (jar != null) {
            try {
                jar.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        /* Remove the temporary copy now; only fall back to removal on exit if that fails: */
        if (adapter != null && !adapter.delete()) {
            adapter.deleteOnExit();
        }
    }
    return props;
}

/**
 * Reads a DTD line by line and collects the name of every declared element. A line we care
 * about looks like this: {@code <!ELEMENT COMPRESSION (#PCDATA)>}. Leading whitespace and any
 * run of whitespace between the tokens are tolerated. The stream is closed when done.
 * @param dtd The content of the DTD file
 * @param names The list to add the declared element names to
 */
private static void addDeclaredElements(final InputStream dtd, final List<String> names) {
    Scanner s = new Scanner(dtd);
    try {
        while (s.hasNextLine()) {
            String line = s.nextLine().trim();
            if (line.startsWith("<!ELEMENT")) {
                String[] tokens = line.split("\\s+");
                /* Skip declarations without a name on the same line: */
                if (tokens.length > 1) {
                    names.add(tokens[1]);
                }
            }
        }
    } finally {
        /* Closing the scanner also closes the underlying stream */
        s.close();
    }
}
/*------------------------------------------------------------------------*/
/*------------------------------- private API ----------------------------*/
/*------------------------------------------------------------------------*/
/**
 * Writes the digital object to a file, harvests that file with the "Extract in Native form"
 * configuration of the extractor tool, and reads back the XML file the tool has written next
 * to the input file. The XML file is deleted once its content has been read.
 * @param digitalObject The binary file to characterize
 * @return Returns the proprietary XML result string returned by the extractor tool, or null
 *         if harvesting or reading the result failed
 * @see eu.planets_project.services.characterise.BasicCharacteriseOneBinary#basicCharacteriseOneBinary(byte[])
 */
private String basicCharacteriseOneBinary(final DigitalObject digitalObject) {
    try {
        File file = DigitalObjectUtils.toFile(digitalObject);
        /* Create a HarvestSource of the object we want to harvest */
        FileHarvestSource source = new FileHarvestSource(file);
        /*
         * Get the native Configuration.
         * NOTE(review): the configuration comes from Config.getInstance() and may be shared
         * between calls; confirm that setting the output directory here is safe when
         * requests are served concurrently.
         */
        Configuration c = Config.getInstance().getConfiguration("Extract in Native form");
        String tempFolder = file.getParent();
        c.setOutputDirectory(tempFolder);
        /* Harvest the file: */
        c.getHarvester().harvest(c, source, new PropsManager());
        /* The resulting file is the original file plus ".xml": */
        File result = new File(c.getOutputDirectory() + File.separator + file.getName()
                + ".xml");
        try {
            return read(result.getAbsolutePath());
        } finally {
            /*
             * The content is in memory now, so remove the file right away instead of letting
             * result files pile up until the JVM exits; only if that is not possible, fall
             * back to removal on exit:
             */
            if (result.exists() && !result.delete()) {
                result.deleteOnExit();
            }
        }
    } catch (ConfigurationException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Reads a text file line by line. Every line of the file is terminated with a single "\n" in
 * the result, whatever line separator the file uses.
 * @param location The location of the text file to read
 * @return Return the content of the file at the specified location, or null if the file does
 *         not exist
 */
private static String read(final String location) {
    StringBuilder builder = new StringBuilder();
    Scanner s = null;
    try {
        s = new Scanner(new File(location));
        while (s.hasNextLine()) {
            builder.append(s.nextLine()).append("\n");
        }
        return builder.toString();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        /* Release the file handle; an open handle would also block deleting the file: */
        if (s != null) {
            s.close();
        }
    }
    return null;
}
}