Source Code of org.jahia.services.content.rules.ExtractionService

/**
 * This file is part of Jahia, next-generation open source CMS:
 * Jahia's next-generation, open source CMS stems from a widely acknowledged vision
 * of enterprise application convergence - web, search, document, social and portal -
 * unified by the simplicity of web content management.
 *
 * For more information, please visit http://www.jahia.com.
 *
 * Copyright (C) 2002-2011 Jahia Solutions Group SA. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * As a special exception to the terms and conditions of version 2.0 of
 * the GPL (or any later version), you may redistribute this Program in connection
 * with Free/Libre and Open Source Software ("FLOSS") applications as described
 * in Jahia's FLOSS exception. You should have received a copy of the text
 * describing the FLOSS exception, and it is also available here:
 * http://www.jahia.com/license
 *
 * Commercial and Supported Versions of the program (dual licensing):
 * alternatively, commercial and supported versions of the program may be used
 * in accordance with the terms and conditions contained in a separate
 * written agreement between you and Jahia Solutions Group SA.
 *
 * If you are unsure which license is appropriate for your use,
 * please contact the sales department at sales@jahia.com.
 */


package org.jahia.services.content.rules;


import java.io.IOException;
import java.io.InputStream;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Map;


import javax.jcr.Node;
import javax.jcr.PathNotFoundException;
import javax.jcr.RepositoryException;
import javax.jcr.UnsupportedRepositoryOperationException;


import org.apache.commons.lang.StringUtils;
import org.apache.jackrabbit.value.BinaryImpl;
import org.slf4j.Logger;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.drools.WorkingMemory;
import org.drools.spi.KnowledgeHelper;
import org.jahia.api.Constants;
import org.jahia.services.content.JCRCallback;
import org.jahia.services.content.JCRNodeWrapper;
import org.jahia.services.content.JCRSessionWrapper;
import org.jahia.services.content.JCRStoreProvider;
import org.jahia.services.content.JCRTemplate;
import org.jahia.services.content.decorator.JCRFileContent;
import org.jahia.services.textextraction.TextExtractionService;
import org.jahia.settings.SettingsBean;


/**
 * Jahia text extraction service that uses Apache Tika parsers to extract content from documents.
 * User: toto
 * Date: Jan 9, 2009
 * Time: 6:33:45 PM
 */
public class ExtractionService {


    private static ExtractionService instance;
    
    private static Logger logger = org.slf4j.LoggerFactory.getLogger(ExtractionService.class);


    public static ExtractionService getInstance() {
        if (instance == null) {
            instance = new ExtractionService();
        }
        return instance;
    }


    private JCRTemplate jcrTemplate;
    
    private TextExtractionService textExtractionService;
    
    private Map<String, String[]> mapping = new HashMap<String, String[]>();


    /**
     * Performs a check if the provided node can be handled by currently
     * configured parsers.
     * 
     * 
     * @param node the node to be checked
     * @return <code>true</code> if there is a parser that can handle provided
     *         node's content
     * @throws IOException in case of the read/write errors
     */
    public boolean canHandle(JCRNodeWrapper node) throws IOException {
        if (!textExtractionService.isEnabled()) {
            return false;
        }
        String mimeType = null;
        try {
            mimeType = node.getProperty(Constants.JCR_MIMETYPE).getString();
        } catch (PathNotFoundException e) {
            // ignore
        } catch (RepositoryException e) {
            logger.warn("Unable to get mime type for the node " + node.getPath() + ". Cause: " + e.getMessage(), e);
        }
        // no mime type detected -> skip it
        if (mimeType == null) {
            return false;
        }
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, mimeType);
        
        // TODO check if it makes sense to also provide the stream


        return textExtractionService.canHandle(null, metadata);
    }


    /**
     * Extract properties from document
     * @param node file node in JCR
     * @param drools context of the rule engine
     * @throws Exception
     */
    public void extractProperties(AddedNodeFact node, KnowledgeHelper drools) throws Exception{
        if (!textExtractionService.isEnabled()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Text extraction service is disabled. Skip extracting properties for node " + node.getPath());
            }
            return;
        }
        
        AddedNodeFact contentNode = node.getContent();
        if (contentNode == null) {
            return;
        }
        
        String mimeType = null;
        try {
            mimeType = contentNode.getNode().getProperty(Constants.JCR_MIMETYPE).getString();
        } catch (Exception e) {
            logger.warn("Unable to detect mime type for node " + node.getPath(), e);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Extracting properties for document " + node.getPath() + " with mime type '" + mimeType + "'");
        }
        Metadata metadata = new Metadata();
        if (mimeType != null) {
            metadata.set(Metadata.CONTENT_TYPE, mimeType);
        }
        
        InputStream stream = null;
        try {
            stream =  contentNode.getNode().getProperty(Constants.JCR_DATA).getBinary().getStream();
            textExtractionService.extractMetadata(stream, metadata);
        } catch (Exception e) {
            if (logger.isDebugEnabled()) {
                logger.warn("Error extracting metadata for node " + node.getPath(), e);
            } else {
                logger.warn("Error extracting metadata for node " + node.getPath() + ". Cause: "
                        + e.getMessage()
                        + (e.getCause() != null ? " Original cause: " + e.getCause().getMessage() : ""));
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }


        if (metadata.size() > 0) {
            try {
            WorkingMemory memory = drools.getWorkingMemory();
            for (String key : metadata.names()) {
                if (!Metadata.CONTENT_TYPE.equals(key)) {
                    // TODO handle multivalue metadata properties
                    String value = metadata.get(key); 
                    if (value != null) {
                        String[] mappedTo = mapping.get(key);
                        memory.insert(new ExtractedVariable(node.getPath(), key, value, mappedTo != null ? mappedTo[0] : null, mappedTo != null ? mappedTo[1] : null));
                    }
                }
            }
            } catch (Exception e) {
                logger.warn("Error extracting metadata for node " + node.getPath(), e);
            }
        }
    }


    /**
     * Synchronously extract text from document and store it as a property of either the file 
     * node itself or the node referred to by the extractionNodePath parameter.
     * The node must have the following properties, which will be set:
     * j:extractedText, j:lastExtractionDate and for the case of extractionNodePath being passed
     * also j:originalUuid is required. 
     * @param provider the JCR store provider
     * @param sourcePath the path to the file node
     * @param extractionNodePath the optional path to the node receiving the extracted text
     * @throws IOException
     */
    public void extractText(final JCRStoreProvider provider, final String sourcePath, final String extractionNodePath) throws IOException {


        if (!textExtractionService.isEnabled()) {
            return;
        }
        try {
            jcrTemplate.doExecuteWithSystemSession(new JCRCallback<Object>() {
                public Object doInJCR(JCRSessionWrapper session) throws RepositoryException {
                    JCRNodeWrapper file = session.getNode(sourcePath);
                    JCRFileContent fileContent = file.getFileContent();
                    Node n = file.getRealNode().getNode(Constants.JCR_CONTENT);


                    String type = fileContent.getContentType();


                    // jcr:encoding is not mandatory
                    String encoding = fileContent.getEncoding();


                    InputStream stream = fileContent.downloadFile();
                    try {
                        Metadata metadata = new Metadata();
                        metadata.set(Metadata.CONTENT_TYPE, type);
                        if (encoding != null) {
                            metadata.set(Metadata.CONTENT_ENCODING, encoding);
                        }
                        String content = textExtractionService.parse(stream, metadata);
                        if (extractionNodePath != null && extractionNodePath.length() > 0) {
                            n = session.getNode(extractionNodePath);
                            session.checkout(n);
                            try {
                                n.setProperty(Constants.ORIGINAL_UUID, file.getIdentifier());
                            } catch (UnsupportedRepositoryOperationException e) {
                                // ignore
                            }
                        } else {
                            session.checkout(n);
                        }
                        n.setProperty(Constants.EXTRACTED_TEXT, new BinaryImpl(content.getBytes(SettingsBean
                                .getInstance().getCharacterEncoding())));
                        n.setProperty(Constants.EXTRACTION_DATE, new GregorianCalendar());
                        session.save();
                    } catch (Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.warn("Cannot extract content for node " + sourcePath, e);
                        } else {
                            logger.warn("Cannot extract content for node " + sourcePath + ". Cause: "
                                    + e.getMessage()
                                    + (e.getCause() != null ? " Original cause: " + e.getCause().getMessage() : ""));
                        }
                    } finally {
                        IOUtils.closeQuietly(stream);
                    }
                    return null;
                }
            });
        } catch (RepositoryException e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Cannot extract content: " + e.getMessage(), e);
            } else {
                logger.warn("Cannot extract content: " + e.getMessage());
            }
        }
    }


    /**
     * Returns <code>true</code> if the text extraction service is activated.
     * 
     * @return <code>true</code> if the text extraction service is activated
     */
    public boolean isEnabled() {
        return textExtractionService.isEnabled();
    }


    public void setJcrTemplate(JCRTemplate jcrTemplate) {
        this.jcrTemplate = jcrTemplate;
    }
    
    /**
     * @param textExtractionService the textExtractionService to set
     */
    public void setTextExtractionService(TextExtractionService textExtractionService) {
        this.textExtractionService = textExtractionService;
    }


    /**
     * @param mapping the mapping to set
     */
    public void setMapping(Map<String, String> mapping) {
        this.mapping = new HashMap<String, String[]>(mapping.size());
        for (Map.Entry<String, String> entry : mapping.entrySet()) {
            this.mapping.put(entry.getKey(), StringUtils.split(entry.getValue(), '.'));
        }
    }
}
Source Code of org.jahia.services.content.rules.ExtractionService

Related Classes of org.jahia.services.content.rules.ExtractionService