Package org.apache.stanbol.entityhub.indexing.core.source

Source Code of org.apache.stanbol.entityhub.indexing.core.source.LineBasedEntityIterator

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.entityhub.indexing.core.source;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Map;
import java.util.NoSuchElementException;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.stanbol.entityhub.indexing.core.EntityIterator;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of the {@link EntityIterator} based on reading data line wise
* from an {@link InputStream}
* @author Rupert Westenthaler
*
*/
public class LineBasedEntityIterator implements EntityIterator {
    /**
     * Parameter used to configure the position of the entity score (starting
     * with 1)
     */
    public static final String PARAM_SCORE_POS = "score-pos";
    /**
     * Default position for the entity score=2
     */
    public static final int DEFAULT_SCORE_POS = 2;
    /**
     * Parameter used to configure the position of the entity id (starting
     * with 1)
     */
    public static final String PARAM_ID_POS = "id-pos";
    /**
     * Default position for the entity id=1
     */
    public static final int DEFAULT_ID_POS = 1;
    public static final String PARAM_ID_NAMESPACE = "id-namespace";


    private static final Logger log = LoggerFactory.getLogger(LineBasedEntityIterator.class);
   
    public static final String PARAM_SEPARATOR = "separator";
    /**
     * The default separator to split the entity id with the score "\t" (tab)
     */
    public static final String DEFAULT_SEPARATOR = "\t";
    /**
     * The default encoding used to read the data from the parsed {@link InputStream}
     * (UTF-8)
     */
    public static final String DEFAULT_ENCODING = "UTF-8";
    /**
     * Parameter used to configure the name of the source file within the
     * {@link IndexingConfig#getSourceFolder()}
     */
    public static final String PARAM_ENTITY_SCORE_FILE = "source";
    /**
     * The default name used for the {@link #PARAM_ENTITY_SCORE_FILE}
     */
    public static final String DEFAULT_ENTITY_SCORE_FILE = "entityScores.tsv";
    /**
     * Parameter used to configure if the Entity IDs should be {@link URLEncoder
     * URL encoded} (the default is {@value #DEFAULT_ENCODE_ENTITY_IDS})
     */
    public static final String PARAM_URL_ENCODE_ENTITY_IDS = "encodeIds";
    public static final String PARAM_URL_DECODE_ENTITY_IDS = "decodeIds";
    /**
     * Parameter used to configure the text encoding used by the source file
     * (the default is {@value #DEFAULT_ENCODING})
     */
    public static final String PARAM_CHARSET = "charset";
    /**
     * Parameter used to configure if {@link String#trim()} should be called
     * on lines.
     */
    public static final String PARAM_TRIM_LINE = "trimLine";
    /**
     * The default value for {@link #PARAM_TRIM_LINE} is <code>false</code>
     */
    public static final boolean DEFAULT_TRIM_LINE = false;
    /**
     * Parameter used to configure if {@link String#trim()} should be called
     * on the element used to parse the entity id.
     */
    public static final String PARAM_TRIM_ID = "trimEntity";
    /**
     * The default value vor {@link #DEFAULT_TRIM_ENTITY} is <code>true</code>
     */
    public static final boolean DEFAULT_TRIM_ENTITY = true;
   
    protected File scoreFile;
    protected BufferedReader reader;
    private String separator;
    private String charset;
    /**
     * Used to indicate if Entity IDs should be URL encoded/decoded<p>
     * <source><pre>
     * 0 ... no action
     * -1 ... decode
     * +1 ... encode
     * </pre></source>
     */
    private int encodeEntityIds;
    private long lineCounter = 0;
    private EntityScore nextEntity;
    private int scorePos;
    private int idPos;
    protected String namespace;
    private boolean trimLine;
    private boolean trimEntityId;
   
    /**
     * Default constructor relaying on {@link #setConfiguration(Map)} is used
     * to provide the configuration
     */
    public LineBasedEntityIterator(){
        this(null);
    }
    /**
     * Constructs an EntityScoreIterator that reads {@link EntityScore}s based
     * on lines provided by the parsed InputStream. <p> Separator, Charset and
     * encoding of Entity ids are initialised based on the default values.
     * @param is the InputStream to read the data from
     * @throws IOException On any error while initialising the {@link BufferedReader}
     * based on the parsed {@link InputStream}
     */
    public LineBasedEntityIterator(InputStream is) {
        this(is,null,null);
    }
    /**
     * Constructs an EntityScoreIterator based on the parsed parameters. The
     * default values are used if <code>null</code> is parsed for any parameter
     * other than the InputStream.
     * @param is the InputStream to read the data from
     * @param charset
     * @param separator
     * @throws IOException On any error while initialising the {@link BufferedReader}
     * based on the parsed {@link InputStream}
     * @throws IllegalArgumentException if <code>null</code> is parsed as InputStream
     */
    public LineBasedEntityIterator(InputStream is,String charset,String separator) throws IllegalArgumentException {
        if(charset == null){
            this.charset = DEFAULT_ENCODING;
        } else {
            this.charset = charset;
        }
        if(separator == null){
            this.separator = DEFAULT_SEPARATOR;
        } else {
            this.separator = separator;
        }
        if(is != null){
            initReader(is);
        }
        scorePos = DEFAULT_SCORE_POS;
        idPos = DEFAULT_ID_POS;
        trimEntityId = DEFAULT_TRIM_ENTITY;
        trimLine = DEFAULT_TRIM_LINE;
    }
    @Override
    public void setConfiguration(Map<String,Object> config) {
        IndexingConfig indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
        log.info("Configure {} :",getClass().getSimpleName());
        Object value = config.get(PARAM_CHARSET);
        if(value != null && value.toString() != null){
            this.charset = value.toString();
            log.info("Set charset to '{}'",charset);
        }
        //parse encode/decode EntityIDs
        value = config.get(PARAM_URL_ENCODE_ENTITY_IDS);
        boolean encodeIds;
        if(value != null){
            encodeIds = Boolean.parseBoolean(value.toString());
        } else if (config.containsKey(PARAM_URL_ENCODE_ENTITY_IDS)){
            encodeIds = true;
        } else {
            encodeIds = false;
        }
        value = config.get(PARAM_URL_DECODE_ENTITY_IDS);
        boolean decodeIds;
        if(value != null){
            decodeIds = Boolean.parseBoolean(value.toString());
        } else if (config.containsKey(PARAM_URL_DECODE_ENTITY_IDS)){
            decodeIds = true;
        } else {
            decodeIds = false;
        }
        if(encodeIds && decodeIds){
            throw new IllegalArgumentException(String.format(
                "One can not enable both Parameters '{}' and '{}'!",
                PARAM_URL_DECODE_ENTITY_IDS,PARAM_URL_DECODE_ENTITY_IDS));
        } else if(encodeIds){
            this.encodeEntityIds = 1;
            log.info("activate URL encoding of Entity IDs");
        } else if(decodeIds){
            this.encodeEntityIds = -1;
            log.info("activate URL decoding of Entity IDs");
        }
        value = config.get(PARAM_ENTITY_SCORE_FILE);
        if(reader == null){
            if(value == null || value.toString().isEmpty()){
                scoreFile = indexingConfig.getSourceFile(DEFAULT_ENTITY_SCORE_FILE);
            } else {
                scoreFile = indexingConfig.getSourceFile(value.toString());
            }
            log.info("Set Source File to '"+this.scoreFile+"'");
        } //else reader parsed in the constructor ... nothing todo
        //now done in the initialise() method
//        try {
//            initReader(new FileInputStream(scoreFile));
//        } catch (FileNotFoundException e) {
//            throw new IllegalArgumentException("The File with the entity scores "+scoreFile.getAbsolutePath()+" does not exist",e);
//        }
        value = config.get(PARAM_ID_POS);
        if(value != null){
            try {
                setIdPos(Integer.parseInt(value.toString()));
                log.info("Set Entity ID Position to '{}'",idPos);
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("Unable to parse the position of the entity id from "+value,e);
            }
        }
        value = config.get(PARAM_SCORE_POS);
        if(value != null){
            try {
                setScorePos(Integer.parseInt(value.toString()));
                log.info("Set Score Position to '{}'",scorePos);
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("Unable to parse the position of the entity score from "+value,e);
            }
        }
        if(idPos == scorePos){
            throw new IllegalArgumentException("The position of the ID and the Score " +
                "values MUST NOT be the same value "+ idPos+"! Use "+
                PARAM_ID_POS+"(default="+DEFAULT_ID_POS+") and "+
                PARAM_SCORE_POS+"(default="+DEFAULT_SCORE_POS+") to configure " +
                "other values than the defaults.");
        }
        value = config.get(PARAM_ID_NAMESPACE);
        if(value != null){
            this.namespace = StringEscapeUtils.unescapeJava(value.toString());
            log.info("Set Namespace to ''",namespace);
        }
        value = config.get(PARAM_SEPARATOR);
        if(value != null && !value.toString().isEmpty()){
            this.separator = value.toString();
            log.info("Set Separator to '{}'",separator);
        }
        value = config.get(PARAM_TRIM_LINE);
        if(value != null){
            trimLine = Boolean.parseBoolean(value.toString());
            log.info("Set Trim Line State to '{}'",trimLine);
        } else if(config.containsKey(PARAM_TRIM_LINE)){
            //also accept the key without value as TRUE
            trimLine = true;
            log.info("Set Trim Line State to '{}'",trimLine);
        }
        value = config.get(PARAM_TRIM_ID);
        if(value != null){
            trimEntityId = Boolean.parseBoolean(value.toString());
            log.info("Set Entity ID State to '{}'",trimEntityId);
        } else if(config.containsKey(PARAM_TRIM_ID)){
            //also accept the key without value as TRUE
            trimEntityId = true;
            log.info("Set Entity ID State to '{}'",trimEntityId);
        }
    }
    private void setIdPos(int idPos) {
        if(idPos <= 0){
            throw new IllegalArgumentException("The position of the entity id MUST BE >= 1");
        }
        this.idPos = idPos;
       
    }
    private void setScorePos(int scorePos) {
        if(scorePos <= 0){
            throw new IllegalArgumentException("The position of the entity score MUST BE >= 1");
        }
        this.scorePos = scorePos;
       
    }
    /**
     * used by the constructors and {@link #setConfiguration(Map)} to initialise
     * the reader based on the provided File/InputStream.
     * @param is the input stream
     * @param charset the charset
     */
    private void initReader(InputStream is) {
        try {
            reader = new BufferedReader(new InputStreamReader(is, this.charset));
        } catch (UnsupportedEncodingException e) {
            throw new IllegalArgumentException("The parsed encoding "+charset+" is not supported",e);
        }
    }
    @Override
    public boolean hasNext() {
        if(nextEntity == null){ //consumed or first or end reached
            nextEntity = getNext(); //search for the next
        }
        return nextEntity != null; //return the found state
    }

    @Override
    public EntityScore next() {
        EntityScore entity = nextEntity;
        nextEntity = null; //consume
        if(entity == null){ //no next element (e.g. hasNext was not called)
            entity = getNext(); //search for it now
        }
        if(entity == null){ //if not found
            throw new NoSuchElementException();
        }
        return entity;
    }
    protected EntityScore getNext(){
        String line;
        try {
            while((line = reader.readLine()) != null){
                lineCounter++;
                log.debug("> line = {}",line);
                EntityScore entity = parseEntityFormLine(line);
                if(entity != null){
                    return entity;
                }
            }
            return null; //no more elements!
        } catch (IOException e) {
           throw new IllegalStateException("Unable to read next EntityScore",e);
        }
    }
   /**
     * @param line
     * @return
     */
    protected EntityScore parseEntityFormLine(String line) {
        if(line == null){
            return null;
        }
        if(trimLine){
            line = line.trim();
        }
        String[] parts = line.split(separator);
        String entity;
        Float score;
        if(parts.length >=idPos){
            try {
                String value;
                if(trimEntityId){
                    value = parts[idPos-1].trim();
                } else {
                    value = parts[idPos-1];
                }
                String id;
                if(encodeEntityIds > 0){
                    id = URLEncoder.encode(value,"UTF-8");
                } else if(encodeEntityIds < 0){
                    id = URLDecoder.decode(value, "UTF-8");
                } else {
                    id = value;
                }
                id = StringEscapeUtils.unescapeJava(id);
                log.debug(" - id = {}",id);
                entity = namespace != null ? namespace+id : id;
                log.debug(" - entity = {}",entity);
            } catch (UnsupportedEncodingException e) {
                throw new IllegalStateException("Unable to URLEncode EntityIds",e);
            }
        } else {
            return null; //No id -> no entity ...
        }
        if(parts.length >=scorePos){
            try {
                score = Float.parseFloat(parts[scorePos-1]);
            } catch (NumberFormatException e) {
                log.warn(String.format("Unable to parse the score for " +
                    "Entity %s from value %s in line %s! Use NULL as score 0",
                    entity,parts[scorePos-1],lineCounter));
                score = null;
            }
        } else {
            log.debug("No score for Entity {} in line {}! Use NULL as score",parts[0],lineCounter);
            score = null;
        }
        log.debug(" - score = ",score);
        return new EntityScore(entity,score);
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("Removal form the EnityScore list is not supported");
    }
    @Override
    public boolean needsInitialisation() {
        return reader == null;
    }
    @Override
    public void initialise() {
        try {
            initReader(new FileInputStream(scoreFile));
        } catch (FileNotFoundException e) {
           throw new IllegalStateException("The file with the Entity Scores is missing",e);
        }
    }
       
    @Override
    public void close(){
        if(reader != null){
            try {
                reader.close();
            }catch (IOException e) {
                //ignore
            }
        }
    }
   
}
TOP

Related Classes of org.apache.stanbol.entityhub.indexing.core.source.LineBasedEntityIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.