Package com.findwise.utils.tika

Source Code of com.findwise.utils.tika.ParsedData

package com.findwise.utils.tika;

import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
* Container for data parsed by Tika
*/
public class ParsedData {

    private final String content;

    private final Metadata metadata;

    private final TextSanitizer textSanitizer;

    public ParsedData(String content, Metadata metadata) {
        this.content = content;
        this.metadata = metadata;
        this.textSanitizer = new TextSanitizer();
    }

    /**
     * @return sanitized content from the parsed data
     */
    public String getContent() {
        return textSanitizer.filterInvalidChars(content);
    }

    /**
     * @return a map with metadata fields, with sanitized field values
     */
    public Map<String, Object> getMetadata() {
        Map<String, Object> fields = new HashMap<String, Object>();
        for (String name : metadata.names()) {
            if (metadata.getValues(name).length > 1) {
                String[] metadataValues = metadata.getValues(name);
                fields.put(name, textSanitizer.filterInvalidChars(Arrays.asList(metadataValues)));
            } else {
                fields.put(name, textSanitizer.filterInvalidChars(metadata.get(name)));
            }
        }
        return fields;
    }

    /**
     * @return the identified language of the content
     */
    public String getLanguage() {
        return new LanguageIdentifier(content).getLanguage();
    }
}
TOP

Related Classes of com.findwise.utils.tika.ParsedData

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.