Package quickml.Utilities

Source Code of quickml.Utilities.CSVToInstanceReader

package quickml.Utilities;

import au.com.bytecode.opencsv.CSVReader;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import quickml.data.AttributesMap;
import quickml.data.Instance;
import quickml.data.InstanceImpl;

import java.io.FileReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import quickml.Utilities.Selectors.*;

/**
* Created by alexanderhawk on 10/2/14.
*/


/* This class converts the contents of a csv file into quickml instances.
   Defaults:
   1. the column containing the instance label is assumed to be the first column in the csv file.
   2. any variable that can be parsed to a number will be treated as numeric. Add an underscore to categorical variable values if they are numeric.
   3. all instances are assumed to have equal weight.

   Options:
   1. the column for an instance's label can be specified by its name in the header in the function: columnNameForLabel.
   2. the column for an instance's weight can be specified by its name in the header in the function: columnNameForWeight.
   3. One can specify which variables are categorical by providing either an instance of a NumericSelector to numericSelector(), or
      a CategoricalSelector to categoricalSelector().  Only one of the two needs to be provided.
*/

public class CSVToInstanceReader {
    private List<String> header;
    private String columnNameForLabel;
    private String columnNameForWeight;
    private boolean containsUnLabeledInstances = false;
    private Optional<CategoricalSelector> categoricalSelector = Optional.absent();
    private Optional<NumericSelector> numericSelector = Optional.absent();
    private char delimiter = ',';

    public CSVToInstanceReader() {
    }

    public CSVToInstanceReader(char delimiter, String columnNameForLabel, String columnNameForWeight, Optional<CategoricalSelector> categoricalSelector,
                               Optional<NumericSelector> numericSelector) {
        this.delimiter = delimiter;
        this.columnNameForLabel = columnNameForLabel;
        this.columnNameForWeight = columnNameForWeight;
        this.categoricalSelector = categoricalSelector;
        this.numericSelector = numericSelector;
    }

    public ArrayList<Instance<AttributesMap>> readCsv(String fileName) throws Exception {

        CSVReader reader = new CSVReader(new FileReader(fileName), delimiter, '"');
        List<String[]> csvLines = reader.readAll();

        ArrayList<Instance<AttributesMap>> instances = Lists.newArrayList();
        try {
            header = new ArrayList<String>();
            Collections.addAll(header, csvLines.get(0));
            for (int i = 1; i < csvLines.size(); i++) {
                instances.add(instanceConverter(csvLines.get(i)));
            }
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage());
        }
        return instances;
    }

    private Instance<AttributesMap> instanceConverter(String[] instanceArray) {

        AttributesMap attributesMap = AttributesMap.newHashMap();
        Serializable label = null;
        double weight = 1.0;
        for (int i = 0; i < header.size(); i++) {
            if (i >= instanceArray.length) {
                throw new IndexOutOfBoundsException();
            }

            if (instanceArray[i].isEmpty()) {
                continue;
            }

            boolean haveLabelInFirstCollumn = i == 0 && columnNameForLabel == null;
            boolean matchedCollumnToLabel = columnNameForLabel != null && columnNameForLabel.equals(header.get(i));
            if (haveLabelInFirstCollumn || matchedCollumnToLabel) {
                label = convertToNumberOrCleanedString(header.get(i), instanceArray[i]);
                continue;
            }

            boolean matchedCollumnToWeight = columnNameForWeight != null && columnNameForWeight.equals(header.get(i));
            if (matchedCollumnToWeight) {
                weight = (Double) convertToNumberOrCleanedString(header.get(i), instanceArray[i]);
                continue;
            }

            attributesMap.put(header.get(i), convertToNumberOrCleanedString(header.get(i), instanceArray[i]));
        }
        if (label == null) {
            label = "missing label";
            containsUnLabeledInstances = true;
        }

        return new InstanceImpl<AttributesMap>(attributesMap, label, weight);
    }

    private Serializable convertToNumberOrCleanedString(String varName, String varValue) {
        boolean categoricalOrNumericSelectorProvided = categoricalSelector.isPresent() || numericSelector.isPresent();
        if (!categoricalOrNumericSelectorProvided) {
                return tryToConvertToNumeric(varValue);
        } else {
            //note: quoted values will be treated as categorical unless a selector indicates otherwise
            if (categoricalSelector.isPresent() && categoricalSelector.get().isCategorical(varName)) {
                return categoricalSelector.get().cleanValue(varValue);
            } else if (!numericSelector.isPresent() || numericSelector.get().isNumeric(varName)) {
                if (numericSelector.isPresent()) {
                    varValue = numericSelector.get().cleanValue(varValue);
                }
                return tryToConvertToNumeric(varValue);
            } else {
                //now account for the case where a numeric selector is provided, but no categorical selector is.
                return varValue;
            }
        }
    }


    private Serializable tryToConvertToNumeric(String varValue) {
        try {
            return Long.valueOf(varValue);

        } catch (NumberFormatException e) {
            try {
                return Double.valueOf(varValue);
            } catch (NumberFormatException n) {
                return varValue;
            }
        }
    }


    public static void main(String[] args) {
        Set<String> catVariables = Sets.newHashSet();
        catVariables.add("eap");
        CSVToInstanceReaderBuilder csvReaderBuilder = new CSVToInstanceReaderBuilder().collumnNameForLabel("campaignId").categoricalSelector(new ExplicitCategoricalSelector(catVariables));
        CSVToInstanceReader csvReader = csvReaderBuilder.buildCsvReader();
        try {
            List<Instance<AttributesMap>> instances = csvReader.readCsv("test3");
            for (Instance<AttributesMap> instance : instances)
                System.out.println("label: " + instance.getLabel() + "attributes: " + instance.getAttributes().toString());

        } catch (Exception e)
        {
            throw new RuntimeException();
        }
      }
}
TOP

Related Classes of quickml.Utilities.CSVToInstanceReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.