Source Code of org.apache.tika.utils.Utils

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.utils;


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;


import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;


import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;


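/**
 * Static helper methods used by Tika: extracting the entries of a zip
 * stream into temporary files, and detecting the character encoding of
 * an input stream.
 */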


public class Utils {


    static Logger logger = Logger.getRootLogger();


    public static List unzip(InputStream is) {
        List res = new ArrayList();
        try {
            ZipInputStream in = new ZipInputStream(is);
            ZipEntry entry = null;
            while ((entry = in.getNextEntry()) != null) {
                ByteArrayOutputStream stream = new ByteArrayOutputStream();
                byte[] buf = new byte[1024];
                int len;
                while ((len = in.read(buf)) > 0) {
                    stream.write(buf, 0, len);
                }
                InputStream isEntry = new ByteArrayInputStream(stream
                        .toByteArray());
                File file = File.createTempFile("TIKA_unzip_", "_" + entry.getName());
                
                // TODO we might want to delete the file earlier than on exit,
                // in case Tika is used inside a long-running app
                file.deleteOnExit();
                saveInputStreamInFile(isEntry, new BufferedOutputStream(
                        new FileOutputStream(file)));
                res.add(file);
                isEntry.close();
            }
            in.close();
        } catch (IOException e) {
            logger.error(e.getMessage());
        }
        return res;
    }


    private static void saveInputStreamInFile(InputStream in, OutputStream out)
            throws IOException {
        byte[] buffer = new byte[1024];
        int len;


        while ((len = in.read(buffer)) >= 0)
            out.write(buffer, 0, len);


        in.close();
        out.close();
    }


    /**
     * Try to detect encoding from inputstream and return a UTF-8
     * Reader. A metadata hint can be submitted as part of {@link Metadata}
     * under key {@link HttpHeaders#CONTENT_ENCODING}.
     * 
     * After succesfull detection, fills Metadata with detected content encoding
     * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
     * 
     * @return Reader to utf8 encoded reader.
     */
    public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
        CharsetDetector detector = new CharsetDetector();
    
        // Use the declared character encoding, if available
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        if (encoding != null) {
            detector.setDeclaredEncoding(encoding);
        }
    
        // CharsetDetector expects a stream to support marks
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
    
        detector.setText(stream);
    
        CharsetMatch match = detector.detect();
        if (match == null) {
            throw new TikaException("Unable to detect character encoding");
        }
        
        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
        String language = match.getLanguage();
        if (language != null) {
            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
            metadata.set(Metadata.LANGUAGE, match.getLanguage());
        }
        
        return match.getReader();
    }


}
Source Code of org.apache.tika.utils.Utils

Related Classes of org.apache.tika.utils.Utils