Package org.apache.tika.cli

Source Code of org.apache.tika.cli.TikaCLI

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.cli;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URL;
import java.util.Arrays;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.apache.log4j.WriterAppender;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

/**
* Simple command line interface for Apache Tika.
*/
public class TikaCLI {

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(
                new WriterAppender(new SimpleLayout(), System.err));
        Logger.getRootLogger().setLevel(Level.INFO);

        TikaCLI cli = new TikaCLI();
        for (int i = 0; i < args.length; i++) {
            cli.process(args[i]);
        }
        if (args.length == 0) {
            cli.process("-");
        }
    }

    private Parser parser;

    private Metadata metadata;

    private ContentHandler handler;

    public TikaCLI() throws TransformerConfigurationException {
        parser = new AutoDetectParser();
        handler = getXmlContentHandler();
    }

    public void process(String arg) throws Exception {
        if (arg.equals("-?") || arg.equals("--help")) {
            usage();
        } else if (arg.equals("-v") || arg.equals("--verbose")) {
            Logger.getRootLogger().setLevel(Level.DEBUG);
        } else if (arg.equals("-g") || arg.equals("--gui")) {
            TikaGUI.main(new String[0]);
        } else if (arg.equals("-x") || arg.equals("--xml")) {
            handler = getXmlContentHandler();
        } else if (arg.equals("-h") || arg.equals("--html")) {
            handler = getHtmlContentHandler();
        } else if (arg.equals("-t") || arg.equals("--text")) {
            handler = getTextContentHandler();
        } else if (arg.equals("-m") || arg.equals("--metadata")) {
            handler = getMetadataContentHandler();
        } else {
            metadata = new Metadata();
            if (arg.equals("-")) {
                parser.parse(System.in, handler, metadata);
            } else {
                InputStream input;
                File file = new File(arg);
                if (file.isFile()) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
                    input = new FileInputStream(file);
                } else {
                    URL url = new URL(arg);
                    String path = url.getPath();
                    int slash = path.lastIndexOf('/');
                    String name = path.substring(slash + 1);
                    if (name.length() > 0) {
                        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
                    }
                    input = url.openStream();
                }
                try {
                    parser.parse(input, handler, metadata);
                } finally {
                    input.close();
                }
            }
        }
    }

    private void usage() {
        PrintStream out = System.out;
        out.println("usage: tika [option] file");
        out.println();
        out.println("Options:");
        out.println("    -? or --help       Print this usage message");
        out.println("    -v or --verbose    Print debug level messages");
        out.println("    -g or --gui        Start the Apache Tika GUI");
        out.println("    -x or --xml        Output XHTML content (default)");
        out.println("    -h or --html       Output HTML content");
        out.println("    -t or --text       Output plain text content");
        out.println("    -m or --metadata   Output only metadata");
        out.println();
        out.println("Description:");
        out.println("    Apache Tika will parse the file(s) specified on the");
        out.println("    command line and output the extracted text content");
        out.println("    or metadata to standard output.");
        out.println();
        out.println("    Instead of a file name you can also specify the URL");
        out.println("    of a document to be parsed.");
        out.println();
        out.println("    Use \"-\" as the file name to parse the standard");
        out.println("    input stream.");
        out.println();
        out.println("    Use the \"--gui\" (or \"-g\") option to start");
        out.println("    the Apache Tika GUI. You can drag and drop files");
        out.println("    from a normal file explorer to the GUI window to");
        out.println("    extract text content and metadata from the files.");
    }

    private ContentHandler getXmlContentHandler()
            throws TransformerConfigurationException {
        SAXTransformerFactory factory = (SAXTransformerFactory)
            SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(System.out));
        return handler;
    }

    private ContentHandler getHtmlContentHandler()
            throws TransformerConfigurationException {
        SAXTransformerFactory factory = (SAXTransformerFactory)
        SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(System.out));
        return handler;
    }

    private ContentHandler getTextContentHandler() {
        return new BodyContentHandler(System.out);
    }

    private ContentHandler getMetadataContentHandler() {
        return new DefaultHandler() {
            public void endDocument() {
                String[] names = metadata.names();
                Arrays.sort(names);
                for (String name : names) {
                    System.out.println(name + ": " + metadata.get(name));
                }
            }
        };
    }

}
TOP

Related Classes of org.apache.tika.cli.TikaCLI

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.