/*
 * Package org.data2semantics.exp.dmold
 *
 * Source code of org.data2semantics.exp.dmold.Task1Experiment
 * (scraper navigation text converted to a comment so the file compiles)
 */

package org.data2semantics.exp.dmold;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.data2semantics.exp.RDFMLExperiment;
import org.data2semantics.exp.utils.Result;
import org.data2semantics.exp.utils.ResultsTable;
import org.data2semantics.proppred.kernels.KernelUtils;
import org.data2semantics.proppred.kernels.rdfgraphkernels.RDFFeatureVectorKernel;
import org.data2semantics.proppred.kernels.rdfgraphkernels.RDFIntersectionTreeEdgeVertexPathWithTextKernel;
import org.data2semantics.proppred.kernels.rdfgraphkernels.RDFWLSubTreeWithTextKernel;
import org.data2semantics.proppred.kernels.text.TextUtils;
import org.data2semantics.proppred.learners.Prediction;
import org.data2semantics.proppred.learners.SparseVector;
import org.data2semantics.proppred.learners.evaluation.EvaluationFunction;
import org.data2semantics.proppred.learners.evaluation.EvaluationUtils;
import org.data2semantics.proppred.learners.evaluation.MeanAbsoluteError;
import org.data2semantics.proppred.learners.evaluation.MeanSquaredError;
import org.data2semantics.proppred.learners.evaluation.Task1Score;
import org.data2semantics.proppred.learners.evaluation.Task1ScoreForBins;
import org.data2semantics.proppred.learners.evaluation.Task1ScoreForBothBins;
import org.data2semantics.proppred.learners.liblinear.LibLINEAR;
import org.data2semantics.proppred.learners.liblinear.LibLINEARParameters;
import org.data2semantics.proppred.learners.libsvm.LibSVM;
import org.data2semantics.tools.rdf.RDFFileDataSet;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.util.LiteralUtil;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.rio.RDFFormat;

public class Task1Experiment extends RDFMLExperiment {
  private static String dataDir = "C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\";

  public static void main(String[] args) {
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-file")) {
        i++;
        dataDir = args[i];
      }
    }
   
    long seed = 1;
    createTask1DataSet(1, seed);

    //    double[] bins = {-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 7.5, 9.5, 14.5, 75.5};
    //double[] bins = {0.5, 1.5, 3.5, 6.5, 22.5};
    double[] bins = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 12.5, 15.5, 18.5, 23.5};


    long[] seeds = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101};
    double[] cs = {0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000}

    int[] depths = {1,2,3};
    int[] iterations = {0,2,4,6};

    double[] ps1 = {1};
    double[] ps2 = {0.000001, 0.00001, 0.0001, 0.001, 0.01};


    List<Double> target = new ArrayList<Double>()
    List<Double> targetBins = new ArrayList<Double>()

    for (Value label : labels) {
      double val = LiteralUtil.getDoubleValue(label,0);
      target.add(val);

      for (int i=0; i < bins.length-1; i++) {
        if (val > bins[i] && val <= bins[i+1]) {
          targetBins.add(i+1.0);
        }
      }
    }


    ResultsTable resTable = new ResultsTable();
    resTable.setManWU(0.05);

    boolean inference = true;

    List<EvaluationFunction> evalFuncs1 = new ArrayList<EvaluationFunction>();
    evalFuncs1.add(new Task1ScoreForBins(bins));

    List<EvaluationFunction> evalFuncs2 = new ArrayList<EvaluationFunction>();
    evalFuncs2.add(new Task1Score());
    evalFuncs2.add(new MeanSquaredError());
    evalFuncs2.add(new MeanAbsoluteError());


    LibLINEARParameters linParms = new LibLINEARParameters(LibLINEARParameters.SVC_DUAL, cs);
    linParms.setEvalFunction(new Task1ScoreForBothBins(bins));
    linParms.setDoCrossValidation(false);
    linParms.setSplitFraction((float) 0.8);
    linParms.setEps(0.1);
    linParms.setPs(ps1);

    Map<Double, Double> counts = EvaluationUtils.computeClassCounts(targetBins);
    int[] wLabels = new int[counts.size()];
    double[] weights = new double[counts.size()];

    for (double label : counts.keySet()) {
      wLabels[(int) label - 1] = (int) label;
      weights[(int) label - 1] = 1 / counts.get(label);
    }
    linParms.setWeightLabels(wLabels);
    linParms.setWeights(weights);



    LibLINEARParameters linParms2 = new LibLINEARParameters(LibLINEARParameters.SVR_DUAL, cs);
    linParms2.setEvalFunction(new Task1Score());
    linParms2.setDoCrossValidation(false);
    linParms2.setSplitFraction((float) 0.8);
    linParms2.setEps(0.1);
    linParms2.setPs(ps2);
    linParms2.setBias(1);


    for (int d : depths) {     
      resTable.newRow("ITP BoW, depth="+d);

      RDFFeatureVectorKernel kernel = new RDFIntersectionTreeEdgeVertexPathWithTextKernel(d, false, inference, false);

      System.out.println("Running PathWithText kernel: " + d );

      Map<EvaluationFunction, double[]> resultMap = new HashMap<EvaluationFunction,double[]>();
      Map<EvaluationFunction, double[]> resultMap2 = new HashMap<EvaluationFunction,double[]>();

      List<Result> results = new ArrayList<Result>();

      for (EvaluationFunction evalFunc : evalFuncs1) {
        Result res = new Result();
        double[] resA = new double[seeds.length];
        res.setLabel(evalFunc.getLabel());
        res.setScores(resA);
        res.setHigherIsBetter(evalFunc.isHigherIsBetter());
        results.add(res);
        resultMap.put(evalFunc, resA);
      }

      for (EvaluationFunction evalFunc : evalFuncs2) {
        Result res = new Result();
        double[] resA = new double[seeds.length];
        res.setLabel(evalFunc.getLabel());
        res.setScores(resA);
        res.setHigherIsBetter(evalFunc.isHigherIsBetter());
        results.add(res);
        resultMap2.put(evalFunc, resA);
      }

      Result compR = new Result();
      results.add(compR);


      long tic, toc;

      List<Double> tempLabels = new ArrayList<Double>();
      List<Double> tempLabelsBins = new ArrayList<Double>();
      tempLabels.addAll(target);
      tempLabelsBins.addAll(targetBins);

      tic = System.currentTimeMillis();
      SparseVector[] fv = kernel.computeFeatureVectors(dataset, instances, blackList);
      toc = System.currentTimeMillis();

      fv = TextUtils.computeTFIDF(Arrays.asList(fv)).toArray(new SparseVector[1]);
      fv = KernelUtils.normalize(fv);


      List<SparseVector> fvList = Arrays.asList(fv);


      compR.setLabel("kernel comp time");

      for (int j = 0; j < seeds.length; j++) {
        Collections.shuffle(fvList, new Random(seeds[j]));
        Collections.shuffle(tempLabels, new Random(seeds[j]));
        Collections.shuffle(tempLabelsBins, new Random(seeds[j]))

        fv = fvList.toArray(new SparseVector[1]);
        double[] targetA = new double[tempLabels.size()];
        double[] targetABins = new double[tempLabelsBins.size()];
        for (int i = 0; i < targetA.length; i++) {
          targetA[i] = tempLabels.get(i);
          targetABins[i] = tempLabelsBins.get(i);
        }

        Prediction[] pred = LibLINEAR.trainTestSplit(fv, targetABins, linParms, linParms.getSplitFraction());     
        Prediction[] pred2 = LibLINEAR.trainTestSplit(fv, targetA, linParms2, linParms2.getSplitFraction());       

        double[] targetSplit = LibLINEAR.splitTestTarget(targetA, linParms.getSplitFraction());

        for (EvaluationFunction ef : evalFuncs1) {
          resultMap.get(ef)[j] = ef.computeScore(targetSplit, pred)
        }       
        for (EvaluationFunction ef : evalFuncs2) {
          resultMap2.get(ef)[j] = ef.computeScore(targetSplit, pred2)
        }
      }
      double[] comp = {toc - tic};
      compR.setScores(comp);

      for (Result res : results) {
        resTable.addResult(res);
      }     
    }


    for (int d : depths) {     
      for (int it : iterations) {
        resTable.newRow("RDF WL BoW, depth="+d);

        /*
        List<RDFFeatureVectorKernel> kernels = new ArrayList<RDFFeatureVectorKernel>();
        kernels.add(new RDFWLSubTreeKernel(it,d, inference, false));
        kernels.add(new RDFSimpleTextKernel(d, inference, false));
        RDFFeatureVectorKernel kernel = new RDFCombinedKernel(kernels, true);
        */
       
        RDFFeatureVectorKernel kernel = new RDFWLSubTreeWithTextKernel(it, d, inference, false);
       
       
       
        System.out.println("Running RDFWL + text kernel: " + d + " " + it);

        Map<EvaluationFunction, double[]> resultMap = new HashMap<EvaluationFunction,double[]>();
        Map<EvaluationFunction, double[]> resultMap2 = new HashMap<EvaluationFunction,double[]>();

        List<Result> results = new ArrayList<Result>();

        for (EvaluationFunction evalFunc : evalFuncs1) {
          Result res = new Result();
          double[] resA = new double[seeds.length];
          res.setLabel(evalFunc.getLabel());
          res.setScores(resA);
          res.setHigherIsBetter(evalFunc.isHigherIsBetter());
          results.add(res);
          resultMap.put(evalFunc, resA);
        }

        for (EvaluationFunction evalFunc : evalFuncs2) {
          Result res = new Result();
          double[] resA = new double[seeds.length];
          res.setLabel(evalFunc.getLabel());
          res.setScores(resA);
          res.setHigherIsBetter(evalFunc.isHigherIsBetter());
          results.add(res);
          resultMap2.put(evalFunc, resA);
        }

        Result compR = new Result();
        results.add(compR);


        long tic, toc;

        List<Double> tempLabels = new ArrayList<Double>();
        List<Double> tempLabelsBins = new ArrayList<Double>();
        tempLabels.addAll(target);
        tempLabelsBins.addAll(targetBins);

        tic = System.currentTimeMillis();
        SparseVector[] fv = kernel.computeFeatureVectors(dataset, instances, blackList);
        toc = System.currentTimeMillis();

        fv = TextUtils.computeTFIDF(Arrays.asList(fv)).toArray(new SparseVector[1]);
        fv = KernelUtils.normalize(fv);


        List<SparseVector> fvList = Arrays.asList(fv);


        compR.setLabel("kernel comp time");

        for (int j = 0; j < seeds.length; j++) {
          Collections.shuffle(fvList, new Random(seeds[j]));
          Collections.shuffle(tempLabels, new Random(seeds[j]));
          Collections.shuffle(tempLabelsBins, new Random(seeds[j]))

          fv = fvList.toArray(new SparseVector[1]);
          double[] targetA = new double[tempLabels.size()];
          double[] targetABins = new double[tempLabelsBins.size()];
          for (int i = 0; i < targetA.length; i++) {
            targetA[i] = tempLabels.get(i);
            targetABins[i] = tempLabelsBins.get(i);
          }

          Prediction[] pred = LibLINEAR.trainTestSplit(fv, targetABins, linParms, linParms.getSplitFraction());     
          Prediction[] pred2 = LibLINEAR.trainTestSplit(fv, targetA, linParms2, linParms2.getSplitFraction());       

          double[] targetSplit = LibLINEAR.splitTestTarget(targetA, linParms.getSplitFraction());

          for (EvaluationFunction ef : evalFuncs1) {
            resultMap.get(ef)[j] = ef.computeScore(targetSplit, pred)
          }       
          for (EvaluationFunction ef : evalFuncs2) {
            resultMap2.get(ef)[j] = ef.computeScore(targetSplit, pred2)
          }
        }
        double[] comp = {toc - tic};
        compR.setScores(comp);

        for (Result res : results) {
          resTable.addResult(res);
       
      }
    }



    saveResults(resTable, "task1_" + seed + ".ser");
    resTable.addCompResults(resTable.getBestResults());
    System.out.println(resTable);
    saveResults(resTable.toString(), "task1_" + seed + ".txt");



  }

  private static void createTask1DataSet(double fraction, long seed) {
    RDFFileDataSet d = new RDFFileDataSet(dataDir + "LDMC_Task1_train.ttl", RDFFormat.TURTLE);
 
    /*
    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\adms.ttl", RDFFormat.TURTLE);
    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\ns.ttl", RDFFormat.TURTLE);
    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\skos.rdf", RDFFormat.RDFXML);
    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\v1.owl", RDFFormat.RDFXML);
    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\schemaorg.owl", RDFFormat.RDFXML);

    d.addFile("C:\\Users\\Gerben\\Dropbox\\D2S\\Task1\\lookup\\describeList\\all.ttl", RDFFormat.TURTLE);

    RDFMultiDataSet test = new RDFMultiDataSet();
    test.addRDFDataSet(d);
    List<String> dbpns = new ArrayList<String>();
    dbpns.add("http://dbpedia.org");
    RDFSparqlDataSet sds = new RDFSparqlDataSet("http://dbpedia.org/sparql", dbpns);
    sds.setLogFile("test.txt");
    //test.addRDFDataSet(sds);
   */

    dataset = d;

    Random rand = new Random(seed);



    List<Statement> stmts = dataset.getStatementsFromStrings(null, RDF.TYPE.toString(), "http://purl.org/procurement/public-contracts#Contract");
    instances = new ArrayList<Resource>();
    labels = new ArrayList<Value>();
    blackList = new ArrayList<Statement>();

    for(Statement stmt: stmts) {
      List<Statement> stmts2 = dataset.getStatementsFromStrings(stmt.getSubject().toString(), "http://purl.org/procurement/public-contracts#numberOfTenders", null);

      for (Statement stmt2 : stmts2) {

        if (rand.nextDouble() < fraction) {
          instances.add(stmt2.getSubject());
          labels.add(stmt2.getObject());
        }
      }
    }

    removeSmallClasses(5);
    createBlackList();

    double[] target = new double[labels.size()];
    for (int i = 0; i < target.length; i++) {
      target[i] = LiteralUtil.getDoubleValue(labels.get(i),0);
    }
    Map<Double, Double> cc = LibSVM.computeClassCounts(target);
    Double[] keys = cc.keySet().toArray(new Double[1]);
    Arrays.sort(keys);

    for (double key : keys) {
      System.out.println(key + " -> " + cc.get(key));
    }
  }


}
/*
 * TOP
 *
 * Related Classes of org.data2semantics.exp.dmold.Task1Experiment
 *
 * TOP
 * Copyright © 2018 www.massapi.com. All rights reserved.
 * All source code are property of their respective owners. Java is a trademark
 * of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
 * (scraper footer converted to a comment so the file compiles)
 */