Package org.apache.tika.cli

Source Code of org.apache.tika.cli.TikaCLI$OutputType

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.cli;

import java.io.File;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URL;
import java.util.Arrays;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.apache.log4j.WriterAppender;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.MetadataHelper;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
* Simple command line interface for Apache Tika.
*/
public class TikaCLI {

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(
                new WriterAppender(new SimpleLayout(), System.err));
        Logger.getRootLogger().setLevel(Level.INFO);

        TikaCLI cli = new TikaCLI();
        for (int i = 0; i < args.length; i++) {
            cli.process(args[i]);
        }
        if (cli.pipeMode) {
            cli.process("-");
        }
    }

    private interface OutputType {
        ContentHandler getContentHandler() throws Exception;
    }

    private final OutputType XML = new OutputType() {
        public ContentHandler getContentHandler() throws Exception {
            return getTransformerHandler("xml", encoding);
        }
    };

    private final OutputType HTML = new OutputType() {
        public ContentHandler getContentHandler() throws Exception {
            return getTransformerHandler("html", encoding);
        }
    };

    private final OutputType TEXT = new OutputType() {
        public ContentHandler getContentHandler() throws Exception {
            return new BodyContentHandler(getSystemOutWriter(encoding));
        }
    };

    private final OutputType METADATA = new OutputType() {
        public ContentHandler getContentHandler() throws Exception {
            final PrintWriter writer =
                new PrintWriter(getSystemOutWriter(encoding));
            return new DefaultHandler() {
                public void endDocument() {
                    String[] names = metadata.names();
                    Arrays.sort(names);
                    for (String name : names) {
                        writer.println(name + ": " + metadata.get(name));
                    }

                    writer.flush();
                }
            };
        }
    };

    private ParseContext context;

    private Parser parser;

    private Metadata metadata;

    private OutputType type = XML;

    /**
     * Output character encoding, or <code>null</code> for platform default
     */
    private String encoding = null;

    private boolean pipeMode = true;

    public TikaCLI() throws TransformerConfigurationException {
        context = new ParseContext();
        parser = new AutoDetectParser();
        context.set(Parser.class, parser);
    }

    public void process(String arg) throws Exception {
        if (arg.equals("-?") || arg.equals("--help")) {
            pipeMode = false;
            usage();
        } else if (arg.equals("-v") || arg.equals("--verbose")) {
            Logger.getRootLogger().setLevel(Level.DEBUG);
        } else if (arg.equals("-g") || arg.equals("--gui")) {
            pipeMode = false;
            TikaGUI.main(new String[0]);
        } else if (arg.startsWith("-e")) {
            encoding = arg.substring("-e".length());
        } else if (arg.startsWith("--encoding=")) {
            encoding = arg.substring("--encoding=".length());
        } else if (arg.equals("-x") || arg.equals("--xml")) {
            type = XML;
        } else if (arg.equals("-h") || arg.equals("--html")) {
            type = HTML;
        } else if (arg.equals("-t") || arg.equals("--text")) {
            type = TEXT;
        } else if (arg.equals("-m") || arg.equals("--metadata")) {
            type = METADATA;
        } else {
            pipeMode = false;
            metadata = new Metadata();
            if (arg.equals("-")) {
                parser.parse(
                        System.in, type.getContentHandler(),
                        metadata, context);
            } else {
                URL url;
                File file = new File(arg);
                if (file.isFile()) {
                    url = file.toURI().toURL();
                } else {
                    url = new URL(arg);
                }
                InputStream input =
                    MetadataHelper.getInputStream(url, metadata);
                try {
                    parser.parse(
                            input, type.getContentHandler(),
                            metadata, context);
                } finally {
                    input.close();
                }
            }
        }
    }

    private void usage() {
        PrintStream out = System.out;
        out.println("usage: tika [option] [file]");
        out.println();
        out.println("Options:");
        out.println("    -?  or --help        Print this usage message");
        out.println("    -v  or --verbose     Print debug level messages");
        out.println("    -g  or --gui         Start the Apache Tika GUI");
        out.println("    -eX or --encoding=X  Use output encoding X");
        out.println("    -x  or --xml         Output XHTML content (default)");
        out.println("    -h  or --html        Output HTML content");
        out.println("    -t  or --text        Output plain text content");
        out.println("    -m  or --metadata    Output only metadata");
        out.println();
        out.println("Description:");
        out.println("    Apache Tika will parse the file(s) specified on the");
        out.println("    command line and output the extracted text content");
        out.println("    or metadata to standard output.");
        out.println();
        out.println("    Instead of a file name you can also specify the URL");
        out.println("    of a document to be parsed.");
        out.println();
        out.println("    If no file name or URL is specified (or the special");
        out.println("    name \"-\" is used), then the standard input stream");
        out.println("    is parsed.");
        out.println();
        out.println("    Use the \"--gui\" (or \"-g\") option to start");
        out.println("    the Apache Tika GUI. You can drag and drop files");
        out.println("    from a normal file explorer to the GUI window to");
        out.println("    extract text content and metadata from the files.");
    }

    /**
     * Returns a {@link System#out} writer with the given output encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param encoding output encoding,
     *                 or <code>null</code> for the platform default
     * @return {@link System#out} writer
     * @throws UnsupportedEncodingException
     *         if the configured encoding is not supported
     */
    private static Writer getSystemOutWriter(String encoding)
            throws UnsupportedEncodingException {
        if (encoding != null) {
            return new OutputStreamWriter(System.out, encoding);
        } else if (System.getProperty("os.name")
                .toLowerCase().startsWith("mac os x")) {
            // TIKA-324: Override the default encoding on Mac OS X
            return new OutputStreamWriter(System.out, "UTF-8");
        } else {
            return new OutputStreamWriter(System.out);
        }
    }

    /**
     * Returns a transformer handler that serializes incoming SAX events
     * to XHTML or HTML (depending the given method) using the given output
     * encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param method "xml" or "html"
     * @param encoding output encoding,
     *                 or <code>null</code> for the platform default
     * @return {@link System#out} transformer handler
     * @throws TransformerConfigurationException
     *         if the transformer can not be created
     */
    private static TransformerHandler getTransformerHandler(
            String method, String encoding)
            throws TransformerConfigurationException {
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        if (encoding != null) {
            handler.getTransformer().setOutputProperty(
                    OutputKeys.ENCODING, encoding);
        }
        handler.setResult(new StreamResult(System.out));
        return handler;
    }

}
TOP

Related Classes of org.apache.tika.cli.TikaCLI$OutputType

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.