Package com.findwise.hydra.stage.tika

Source Code of com.findwise.hydra.stage.tika.SimpleFetchingTikaStage

package com.findwise.hydra.stage.tika;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.Map;

import com.findwise.utils.tika.InputStreamParser;
import com.findwise.utils.tika.ParsedData;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.findwise.hydra.local.LocalDocument;
import com.findwise.hydra.stage.AbstractProcessStage;
import com.findwise.hydra.stage.Parameter;
import com.findwise.hydra.stage.RequiredArgumentMissingException;
import com.findwise.hydra.stage.Stage;

/**
* Downloads content which should be parsed by Tika. A pattern is given which
* points out all the fields containing URLs to download and parsed.
* <p/>
* Notice that the stage will add a prefix to all parsed fields. The prefix has
* the following format FIRST_CAPTURING_CROUP + _ + PARSED_FIELD_NAME
* <p/>
* e.g. If urlFieldPattern is set to crawl_(url) all parsed fields from tika
* will start with url_.
* <p/>
* Sample configuration:
* {@code
* {
* "stageClass": "com.findwise.hydra.stage.tika.SimpleFetchingTikaStage",
* "query": {
* "equals":{"type": "file" },
* "touched":{"set-internal-type": true}
* },
* "urlFieldPattern" : "(url)"
* }
* }
*/
@Stage(description = "A stage that fetches the content from a given url and appends it the the document")
public class SimpleFetchingTikaStage extends AbstractProcessStage {
    private static Logger logger = LoggerFactory.getLogger(SimpleFetchingTikaStage.class);

    @Parameter(required = true, description = "The field name pattern that should be matched where " +
            "urls will be found. First group plus \"_\" will be used as field prefix. Example:" +
            " \"attachment_(.*)\" will match for example attachment_a and will use \"a_\" as prefix")
    public String urlFieldPattern = null;

    @Parameter(name = "addMetaData", description = "Add the metadata to the document or not. Defaults to true")
    public boolean addMetaData = true;

    @Parameter(description = "Set to true, will also do language detection and add the field 'prefix_language' according to the prefix rules. Defaults to true")
    public boolean addLanguage = true;

    @Parameter(description = "Username for basic authentication.")
    public String username = null;

    @Parameter(description = "Password for basic authentication.")
    public String password = null;

    private Parser parser = new AutoDetectParser();

    @Override
    public void process(LocalDocument doc) throws Exception {
        Map<String, Object> urls = FieldHelper.getFieldMatchingPattern(doc.getContentMap(),
                urlFieldPattern);
        UriParser uriParser = new UriParser();
        DocumentParserHelper documentParserHelper = new DocumentParserHelper(addMetaData, addLanguage);
        InputStreamParser inputStreamParser = new InputStreamParser(parser);
        for (String field : urls.keySet()) {
            Iterator<URL> it = uriParser.getUrlsFromObject(urls.get(field)).iterator();
            for (int i = 1; it.hasNext(); i++) {
                String num = (i > 1) ? "" + i : "";
                URL url = it.next();
                URLConnection connection = createConnection(url);
                final InputStream inputStream = connection.getInputStream();
                try {
                    String prefix = field + num + "_";
                    ParsedData parsedData = inputStreamParser.parse(inputStream);
                    documentParserHelper.addParsedDataToDocument(parsedData, doc, prefix);
                } finally {
                    inputStream.close();
                }
            }
        }
    }

    private URLConnection createConnection(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        if (useBasicAuthentication()) {
            String authString = username + ":" + password;
            byte[] authEncBytes = Base64.encodeBase64(authString
                    .getBytes("UTF-8"));
            String authStringEnc = "Basic " + new String(authEncBytes, "UTF-8");
            connection.setRequestProperty("Authorization", authStringEnc);
        }
        return connection;
    }

    private boolean useBasicAuthentication() {
        return username != null && password != null;
    }

    @Override
    public void init() throws RequiredArgumentMissingException {
        if (urlFieldPattern == null) {
            throw new RequiredArgumentMissingException(
                    "Missing parameter urlFieldPattern");
        }
        logger.debug("Initiated SimpleTikaStage");
    }

  /* For testing purposes */

    void setUrlFieldPattern(String urlFieldPattern) {
        this.urlFieldPattern = urlFieldPattern;
    }

    void setParser(Parser parser) {
        this.parser = parser;
    }
}
TOP

Related Classes of com.findwise.hydra.stage.tika.SimpleFetchingTikaStage

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.