Package org.modeshape.sequencer.msoffice

Source Code of org.modeshape.sequencer.msoffice.MSOfficeMetadataSequencer$MimeTypeConstants

/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.sequencer.msoffice;

import static org.modeshape.jcr.api.JcrConstants.JCR_MIME_TYPE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.AUTHOR;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CHARACTERS;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.COMMENT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CREATED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.CREATING_APPLICATION;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.EXCEL_SHEET;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.EXCEL_SHEET_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.FULL_CONTENT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_LEVEL;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_NAME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.HEADING_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.KEYWORDS;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.LAST_PRINTED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.METADATA_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.NOTES;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.PAGES;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.REVISION;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SAVED;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SHEET_NAME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SLIDE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SLIDE_NODE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.SUBJECT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TEMPLATE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TEXT;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.THUMBNAIL;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TITLE;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.TOTAL_EDITING_TIME;
import static org.modeshape.sequencer.msoffice.MSOfficeMetadataLexicon.WORDS;
import java.io.IOException;
import java.io.InputStream;
import javax.jcr.NamespaceRegistry;
import javax.jcr.Node;
import javax.jcr.Property;
import javax.jcr.RepositoryException;
import javax.jcr.Value;
import org.modeshape.common.util.CheckArg;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.nodetype.NodeTypeManager;
import org.modeshape.jcr.api.sequencer.Sequencer;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadata;
import org.modeshape.sequencer.msoffice.excel.ExcelMetadataReader;
import org.modeshape.sequencer.msoffice.excel.ExcelSheetMetadata;
import org.modeshape.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.modeshape.sequencer.msoffice.powerpoint.PowerpointMetadata;
import org.modeshape.sequencer.msoffice.powerpoint.SlideMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadata;
import org.modeshape.sequencer.msoffice.word.WordMetadataReader;

/**
* A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
* metadata to the repository.
* <p>
* This sequencer produces data that corresponds to the following structure:
* <ul>
* <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
* <ul>
* <li><strong>msoffice:title</strong> optional string property for the title of the document</li>
* <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
* <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
* <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
* <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
* <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
* <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
* <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
* <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
* <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
* <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
* <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
* <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
* <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
* <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
* <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
* <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of this document</li>
* <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
* <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
* </ul>
* </li>
* <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
* <ul>
* <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
* <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
* <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
* <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
* </ul>
* </li>
* </ul>
* </p>
*/
public class MSOfficeMetadataSequencer extends Sequencer {

    public static final class MimeTypeConstants {
        public static final String MICROSOFT_APPLICATION_MS_WORD = "application/msword";
        public static final String MICROSOFT_WORD = "application/vnd.ms-word";
        public static final String MICROSOFT_EXCEL = "application/vnd.ms-excel";
        public static final String MICROSOFT_POWERPOINT = "application/vnd.ms-powerpoint";
    }

    @Override
    public void initialize( NamespaceRegistry registry,
                            NodeTypeManager nodeTypeManager ) throws RepositoryException, IOException {
        registerNodeTypes("msoffice.cnd", nodeTypeManager, true);
        registerDefaultMimeTypes(MimeTypeConstants.MICROSOFT_EXCEL,
                                 MimeTypeConstants.MICROSOFT_POWERPOINT,
                                 MimeTypeConstants.MICROSOFT_WORD,
                                 MimeTypeConstants.MICROSOFT_APPLICATION_MS_WORD);
    }

    @Override
    public boolean execute( Property inputProperty,
                            Node outputNode,
                            Context context ) throws Exception {
        Binary binaryValue = (Binary)inputProperty.getBinary();
        CheckArg.isNotNull(binaryValue, "binary");

        String inputFileName = getInputFileName(inputProperty);
        String mimeType = binaryValue.getMimeType(inputFileName);

        Node sequencedNode = outputNode;
        if (outputNode.isNew()) {
            outputNode.setPrimaryType(METADATA_NODE);
        } else {
            sequencedNode = outputNode.addNode(METADATA_NODE, METADATA_NODE);
        }

        setProperty(sequencedNode, JCR_MIME_TYPE, mimeType);
        if (isPowerpoint(mimeType)) {
            try (InputStream stream = binaryValue.getStream()) {
                sequencePowerpoint(sequencedNode, context.valueFactory(), stream);
                return true;
            }
        }

        if (isWord(mimeType)) {
            try (InputStream stream = binaryValue.getStream()) {
                sequenceWord(sequencedNode, context.valueFactory(), stream);
                return true;
            }
        }

        if (isExcel(mimeType)) {
            try (InputStream stream = binaryValue.getStream()) {
                sequenceExcel(sequencedNode, context.valueFactory(), stream);
                return true;
            }
        }

        getLogger().warn("Unknown mimetype: {0} for microsoft office", mimeType);
        return false;
    }

    private String getInputFileName( Property inputProperty ) throws RepositoryException {
        return inputProperty.getParent().getParent().getName();
    }

    private boolean isExcel( String mimeType ) {
        return MimeTypeConstants.MICROSOFT_EXCEL.equalsIgnoreCase(mimeType);
    }

    private void sequenceExcel( Node sequencedNode,
                                org.modeshape.jcr.api.ValueFactory valueFactory,
                                InputStream stream ) throws IOException, RepositoryException {
        ExcelMetadata excelMetadata = ExcelMetadataReader.instance(stream);
        recordMetadata(sequencedNode, valueFactory, excelMetadata.getMetadata());
        setProperty(sequencedNode, FULL_CONTENT, excelMetadata.getText());

        for (ExcelSheetMetadata sheetMetadata : excelMetadata.getSheets()) {
            Node sheet = sequencedNode.addNode(EXCEL_SHEET, EXCEL_SHEET_NODE);
            setProperty(sheet, SHEET_NAME, sheetMetadata.getName());
            setProperty(sheet, TEXT, sheetMetadata.getText());
        }
    }

    private boolean isWord( String mimeType ) {
        // See http://blogs.msdn.com/b/vsofficedeveloper/archive/2008/05/08/office-2007-open-xml-mime-types.aspx
        return MimeTypeConstants.MICROSOFT_WORD.equalsIgnoreCase(mimeType)
               || MimeTypeConstants.MICROSOFT_APPLICATION_MS_WORD.equalsIgnoreCase(mimeType);
    }

    private void sequenceWord( Node rootNode,
                               org.modeshape.jcr.api.ValueFactory valueFactory,
                               InputStream stream ) throws RepositoryException, IOException {
        // Sometime in the future this will sequence WORD Table of contents.
        WordMetadata wordMetadata = WordMetadataReader.instance(stream);
        recordMetadata(rootNode, valueFactory, wordMetadata.getMetadata());

        for (WordMetadata.WordHeading headingMetadata : wordMetadata.getHeadings()) {
            Node heading = rootNode.addNode(HEADING_NODE, HEADING_NODE);
            setProperty(heading, HEADING_NAME, headingMetadata.getText());
            setProperty(heading, HEADING_LEVEL, headingMetadata.getHeaderLevel());
        }
    }

    private boolean isPowerpoint( String mimeType ) {
        return MimeTypeConstants.MICROSOFT_POWERPOINT.equalsIgnoreCase(mimeType);
    }

    private void sequencePowerpoint( Node rootNode,
                                     org.modeshape.jcr.api.ValueFactory valueFactory,
                                     InputStream stream ) throws IOException, RepositoryException {
        PowerpointMetadata deck = PowerPointMetadataReader.instance(stream);
        recordMetadata(rootNode, valueFactory, deck.getMetadata());

        for (SlideMetadata slideMetadata : deck.getSlides()) {
            Node slide = rootNode.addNode(SLIDE, SLIDE_NODE);
            setProperty(slide, TITLE, slideMetadata.getTitle());
            setProperty(slide, TEXT, slideMetadata.getText());
            setProperty(slide, NOTES, slideMetadata.getNotes());
            setProperty(slide, THUMBNAIL, valueFactory.createBinary(slideMetadata.getThumbnail()));
        }
    }

    private void recordMetadata( Node rootNode,
                                 org.modeshape.jcr.api.ValueFactory valueFactory,
                                 MSOfficeMetadata metadata ) throws RepositoryException {
        setProperty(rootNode, TITLE, metadata.getTitle());
        setProperty(rootNode, SUBJECT, metadata.getSubject());
        setProperty(rootNode, AUTHOR, metadata.getAuthor());
        setProperty(rootNode, KEYWORDS, metadata.getKeywords());
        setProperty(rootNode, COMMENT, metadata.getComment());
        setProperty(rootNode, TEMPLATE, metadata.getTemplate());
        setProperty(rootNode, SAVED, valueFactory.createValue(metadata.getLastSaved()));
        setProperty(rootNode, REVISION, metadata.getRevision());
        setProperty(rootNode, TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
        setProperty(rootNode, LAST_PRINTED, valueFactory.createValue(metadata.getLastPrinted()));
        setProperty(rootNode, CREATED, valueFactory.createValue(metadata.getCreated()));
        setProperty(rootNode, PAGES, metadata.getPages());
        setProperty(rootNode, WORDS, metadata.getWords());
        setProperty(rootNode, CHARACTERS, metadata.getCharacters());
        setProperty(rootNode, CREATING_APPLICATION, metadata.getCreatingApplication());
        setProperty(rootNode, THUMBNAIL, valueFactory.createBinary(metadata.getThumbnail()));
    }

    private void setProperty( Node node,
                              String propertyName,
                              String value ) throws RepositoryException {
        if (value != null) {
            node.setProperty(propertyName, value);
        }
    }

    private void setProperty( Node node,
                              String propertyName,
                              Value value ) throws RepositoryException {
        if (value != null) {
            node.setProperty(propertyName, value);
        }
    }

    private void setProperty( Node node,
                              String propertyName,
                              Binary value ) throws RepositoryException {
        if (value != null) {
            node.setProperty(propertyName, value);
        }
    }

    // Intentionally use the Long object form, in case this is called by methods that return a null Long reference
    // for optional values
    private void setProperty( Node node,
                              String propertyName,
                              Long value ) throws RepositoryException {
        if (value != null) {
            node.setProperty(propertyName, value.longValue());
        }
    }

    // Intentionally use the Integer object form, in case this is called by methods that return a null Integer reference
    // for optional values
    private void setProperty( Node node,
                              String propertyName,
                              Integer value ) throws RepositoryException {
        if (value != null) {
            node.setProperty(propertyName, value.longValue());
        }
    }

}
TOP

Related Classes of org.modeshape.sequencer.msoffice.MSOfficeMetadataSequencer$MimeTypeConstants

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.