Package org.elasticsearch.plugin.mapper.attachments.test

Source Code of org.elasticsearch.plugin.mapper.attachments.test.StandaloneTest$TikaTest

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.plugin.mapper.attachments.test;

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.cli.CliTool;
import org.elasticsearch.common.cli.CliToolConfig;
import org.elasticsearch.common.cli.Terminal;
import org.elasticsearch.common.cli.commons.CommandLine;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
import org.elasticsearch.index.mapper.xcontent.MapperTestUtils;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Locale;

import static org.elasticsearch.common.cli.CliToolConfig.Builder.cmd;
import static org.elasticsearch.common.cli.CliToolConfig.Builder.option;
import static org.elasticsearch.common.io.Streams.copyToByteArray;
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
* This class provides a simple main class which can be used to test what is extracted from a given binary file.
* You can run it using
*  -u file://URL/TO/YOUR/DOC
*  --size set extracted size (default to mapper attachment size)
*  BASE64 encoded binary
*
* Example:
*  StandaloneTest BASE64Text
*  StandaloneTest -u /tmp/mydoc.pdf
*  StandaloneTest -u /tmp/mydoc.pdf --size 1000000
*/
public class StandaloneTest extends CliTool {

    private static final CliToolConfig CONFIG = CliToolConfig.config("tika", StandaloneTest.class)
                        .cmds(TikaTest.CMD)
                .build();

    static class TikaTest extends Command {
        private static final String NAME = "tika";
        private final String url;
        private final Integer size;
        private final String base64text;
        private final DocumentMapper docMapper;

        private static final CliToolConfig.Cmd CMD = cmd(NAME, TikaTest.class)
                .options(option("u", "url").required(false).hasArg(false))
                .options(option("t", "size").required(false).hasArg(false))
                .build();

        protected TikaTest(Terminal terminal, String url, Integer size, String base64text) throws IOException {
            super(terminal);
            this.size = size;
            this.url = url;
            this.base64text = base64text;
            DocumentMapperParser mapperParser = MapperTestUtils.newMapperParser();
            mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());

            String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
            docMapper = mapperParser.parse(mapping);
        }

        @Override
        public ExitStatus execute(Settings settings, Environment env) throws Exception {
            XContentBuilder builder = jsonBuilder().startObject().field("_id", 1).field("file").startObject();

            if (base64text != null) {
                // If base64 is provided
                builder.field("_content", base64text);
            } else {
                // A file is provided
                File file = new File(new URL(url).getFile());
                boolean exists = file.exists();
                if (!exists) {
                    return ExitStatus.IO_ERROR;
                }

                byte[] bytes = copyToByteArray(file);
                builder.field("_content", bytes);
            }

            if (size >= 0) {
                builder.field("_indexed_chars", size);
            }

            BytesReference json = builder.endObject().endObject().bytes();

            ParseContext.Document doc = docMapper.parse(json).rootDoc();

            terminal.println("## Extracted text");
            terminal.println("--------------------- BEGIN -----------------------");
            terminal.println("%s", doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()));
            terminal.println("---------------------- END ------------------------");
            terminal.println("## Metadata");
            printMetadataContent(doc, AttachmentMapper.FieldNames.AUTHOR);
            printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_LENGTH);
            printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_TYPE);
            printMetadataContent(doc, AttachmentMapper.FieldNames.DATE);
            printMetadataContent(doc, AttachmentMapper.FieldNames.KEYWORDS);
            printMetadataContent(doc, AttachmentMapper.FieldNames.LANGUAGE);
            printMetadataContent(doc, AttachmentMapper.FieldNames.NAME);
            printMetadataContent(doc, AttachmentMapper.FieldNames.TITLE);

            return ExitStatus.OK;
        }

        private void printMetadataContent(ParseContext.Document doc, String field) {
            terminal.println("- %s: %s", field, doc.get(docMapper.mappers().smartName("file." + field).mapper().names().indexName()));
        }

        public static Command parse(Terminal terminal, CommandLine cli) throws IOException {
            String url = cli.getOptionValue("u");
            String base64text = null;
            String sSize = cli.getOptionValue("size");
            Integer size = sSize != null ? Integer.parseInt(sSize) : -1;
            if (url == null && cli.getArgs().length == 0) {
                    return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
            }
            if (url == null) {
                if (cli.getArgs().length == 0) {
                    return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
                }
                base64text = cli.getArgs()[0];
            } else {
                if (cli.getArgs().length == 1) {
                    return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided. Not both. (type -h for help)");
                }
            }
            return new TikaTest(terminal, url, size, base64text);
        }
    }

    public StandaloneTest() {
        super(CONFIG);
    }


    public static void main(String[] args) {
        StandaloneTest pluginManager = new StandaloneTest();
        pluginManager.execute(args);
    }

    @Override
    protected Command parse(String cmdName, CommandLine cli) throws Exception {
        switch (cmdName.toLowerCase(Locale.ROOT)) {
            case TikaTest.NAME: return TikaTest.parse(terminal, cli);
            default:
                    assert false : "can't get here as cmd name is validated before this method is called";
                    return exitCmd(ExitStatus.CODE_ERROR);
        }
    }
}
TOP

Related Classes of org.elasticsearch.plugin.mapper.attachments.test.StandaloneTest$TikaTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.