Package no.priv.garshol.duke.genetic

Source Code of no.priv.garshol.duke.genetic.GeneticAlgorithm$Filter

package no.priv.garshol.duke.genetic;

import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Collection;
import java.util.Collections;
import java.io.IOException;

import no.priv.garshol.duke.Link;
import no.priv.garshol.duke.Record;
import no.priv.garshol.duke.Database;
import no.priv.garshol.duke.LinkKind;
import no.priv.garshol.duke.Property;
import no.priv.garshol.duke.Processor;
import no.priv.garshol.duke.LinkStatus;
import no.priv.garshol.duke.DataSource;
import no.priv.garshol.duke.LinkDatabase;
import no.priv.garshol.duke.ConfigWriter;
import no.priv.garshol.duke.Configuration;
import no.priv.garshol.duke.RecordIterator;
import no.priv.garshol.duke.DukeConfigException;
import no.priv.garshol.duke.InMemoryLinkDatabase;
import no.priv.garshol.duke.utils.LinkDatabaseUtils;
import no.priv.garshol.duke.matchers.MatchListener;
import no.priv.garshol.duke.matchers.TestFileListener;
import no.priv.garshol.duke.matchers.PrintMatchListener;

/**
* The class that actually runs the genetic algorithm.
*/
public class GeneticAlgorithm {
  // the base configuration handed to the constructor
  private Configuration config;
  // the set of candidate configurations being evolved
  private GeneticPopulation population;
  // record database, created and filled with records in run()
  private Database database;
  private Map<String, Record> secondary; // used in record linkage mode
  // known links: loaded from the test file and/or asserted from the
  // answers given by the oracle
  private InMemoryLinkDatabase testdb;
  private double best; // best ever
  private boolean active; // true iff we are using active learning
  // true iff we simulate active learning, answering questions from
  // the test file (see constructor)
  private boolean scientific;
  // answers questions about whether a pair of records is a match
  private Oracle oracle;
  private String outfile; // file to write config to
  // scientific mode only: F-number of each configuration measured
  // against the oracle's link database
  private Map<GeneticConfiguration, Double> sciencetracker;
  private boolean quiet; // limit output
  private boolean incomplete; // is test file incomplete?

  private int threads; // parallel threads to run
  private int generations; // number of generations run() goes through
  private int questions; // number of questions to ask per iteration
  private boolean sparse; // whether to skip asking questions after some gens
  private int skipgens; // number of generations left to skip
  private int asked; // number of questions asked

  private Collection<Pair> used; // all the pairs we've ever asked about

  /**
   * Creates the algorithm.
   * @param testfile Test file to evaluate configs against. If null
   *                 we use active learning instead.
   * @param scientific A mode used for testing. Set to false.
   */
  public GeneticAlgorithm(Configuration config, String testfile,
                          boolean scientific)
    throws IOException {
    this.config = config;
    this.population = new GeneticPopulation(config);
    this.generations = 100;
    this.questions = 10;
    this.testdb = new InMemoryLinkDatabase();
    //testdb.setDoInference(true);
    this.scientific = scientific;
    this.threads = 1;
    this.used = new ArrayList();

    if (!scientific) {
      this.oracle = new ConsoleOracle();
      if (testfile != null)
        LinkDatabaseUtils.loadTestFile(testfile, testdb);
      else
        active = true;
    } else {
      // in scientific mode we simulate active learning by pretending
      // not to have a test file, but answering all questions from the
      // test file. this allows us to evaluate how well the active
      // learning approach actually works.
      active = true;
      this.oracle = new LinkFileOracle(testfile);
      this.sciencetracker = Collections.synchronizedMap(new HashMap());
    }
  }

  /**
   * Sets the number of generations to run the algorithm for. Default
   * 100.
   */
  public void setGenerations(int generations) {
    this.generations = generations;
  }

  /**
   * Sets the size of the population. Default 100.
   */
  public void setPopulation(int population) {
    this.population.setSize(population);
  }

  /**
   * Sets the number of questions to ask per generation in active
   * learning mode. Default 10.
   */
  public void setQuestions(int questions) {
    this.questions = questions;
  }

  /**
   * Set the file to write the best configuration to. The
   * configuration gets written at the end of each generation.
   */
  public void setConfigOutput(String output) {
    this.outfile = output;
  }

  /**
   * Sets the number of threads to run the genetic algorithm in.
   */
  public void setThreads(int threads) {
    this.threads = threads;
  }

  public void setActive(boolean active) {
    // basically, if we have a link file, and call this method, what
    // it means is that we'll evaluate in optimistic mode. that is, we
    // assume that there are correct matches that don't exist in the
    // test file
    this.active = active;
  }

  public void setSparse(boolean sparse) {
    this.sparse = sparse;
  }

  /**
   * Tells the genetic algorithm not to output more than necessary.
   */
  public void setQuiet(boolean quiet) {
    this.quiet = quiet;
  }

  /**
   * Sets the file to write user's answers to in active learning mode.
   */
  public void setLinkFile(String linkfile) throws IOException {
    if (scientific || !active || oracle instanceof LinkFileOracle)
      throw new DukeConfigException("Have no use for link file");

    ((ConsoleOracle) oracle).setLinkFile(linkfile);
  }

  /**
   * Sets the number of mutations to perform on each new configuration
   * for each generation. If not set, the algorithm will evolve a
   * mutation rate.
   */
  public void setMutationRate(int mutation_rate) {
    population.setMutationRate(mutation_rate);
  }

  /**
   * Sets the number of recombinations to perform on each new
   * configuration for each generation. 0.75 means there's a 75%
   * chance we do one recombination. 1.75 means we do one for certain,
   * and, with 75% probability do another.
   */
  public void setRecombinationRate(double recombination_rate) {
    population.setRecombinationRate(recombination_rate);
  }

  /**
   * If true, the algorithm will not evolve the comparators, but only
   * the other aspects of the configuration. The default is to evolve
   * comparators, too.
   */
  public void setEvolveComparators(boolean evolve_comparators) {
    population.setEvolveComparators(evolve_comparators);
  }

  /**
   * Sets how many copies of the original configuration to keep in the
   * first generation. The default is 0, meaning the first generation
   * will be entirely random, but with this option you can make the
   * genetic algorithm start from your existing configuration.
   */
  public void setCopiesOfOriginal(int copies) {
    population.setCopiesOfOriginal(copies);
  }

  /**
   * Tells the algorithm whether to assume the test file contains all
   * correct pairs.
   */
  public void setIncompleteTest(boolean incomplete) {
    this.incomplete = incomplete;
  }

  /**
   * Actually runs the genetic algorithm.
   */
  public void run() {
    // first index up all records
    Collection<DataSource> sources;
    if (config.isDeduplicationMode())
      sources = config.getDataSources();
    else
      sources = config.getDataSources(1);

    database = config.getDatabase(true);
    for (DataSource src : sources) {
      RecordIterator it = src.getRecords();
      while (it.hasNext())
        database.index(it.next());
    }
    database.commit();

    // remember second set of records, too
    if (!config.isDeduplicationMode() && active) {
      // in record linkage mode we need to be able to look up records
      // in the second group, so that we can show them to the user
      // when asking questions about them
      secondary = new HashMap();
      for (DataSource src : config.getDataSources(2)) {
        RecordIterator it = src.getRecords();
        while (it.hasNext()) {
          Record r = it.next();
          secondary.put(getid(r), r);
        }
      }
    }

    // make first, random population
    population.create();

    // run through the required number of generations
    double prevbest = 0.0;
    int stuck_for = 0; // number of generations f has remained unchanged
    for (int gen = 0; gen < generations; gen++) {
      if (!quiet)
        System.out.println("===== GENERATION " + gen);
      double best = evolve(gen);
    }
  }

  /**
   * Creates a new generation.
   * @param gen_no The number of the generation. The first is 0.
   */
  public double evolve(int gen_no) {
    // evaluate current generation
    ExemplarsTracker tracker = null;
    if (active) {
      // the first time we try to find correct matches so that we're
      // guranteed the algorithm knows about *some* correct matches
      Comparator comparator = gen_no == 0 ?
        new FindCorrectComparator() : new DisagreementComparator();
      tracker = new ExemplarsTracker(config, comparator);
    }
    if (threads == 1)
      evaluateAll(tracker);
    else
      evaluateAllThreaded(tracker);

    population.sort();

    // compute some key statistics
    double fsum = 0.0;
    double lbest = -1.0;
    GeneticConfiguration best = null;
    List<GeneticConfiguration> pop = population.getConfigs();
    for (GeneticConfiguration cfg : pop) {
      fsum += cfg.getFNumber();
      if (cfg.getFNumber() > lbest) {
        lbest = cfg.getFNumber();
        best = cfg;
      }
    }
    if (!quiet) {
      System.out.println("BEST: " + lbest + " AVERAGE: " + (fsum / pop.size()));
      for (GeneticConfiguration cfg : pop)
        System.out.print(cfg.getFNumber() + " ");
      System.out.println();
    }

    // ask questions, if we're active
    if (active && skipgens == 0) {
      askQuestions(tracker);
      if (sparse) {
        if (gen_no > 9)
          skipgens = 3; // ask every fourth generation after 10th gen
        else if (gen_no > 1)
          skipgens = 1; // ask every second generation after the first two
      }
    } else if (skipgens > 0) // if we skipped asking, make note of that
      skipgens--;

    // in scientific mode, summarize true statistics for this generation
    if (scientific) {
      double devsum = 0.0;
      fsum = 0.0;
      lbest = -1.0;
      for (GeneticConfiguration cfg : pop) {
        double real = sciencetracker.get(cfg);
        devsum += Math.abs(cfg.getFNumber() - real);
        fsum += real;
        if (real > lbest)
          lbest = real;
      }

      if (!quiet) {
        System.out.println("ACTUAL BEST: " + sciencetracker.get(best) +
                           " ACTUAL AVERAGE: " + (fsum / pop.size()));
        System.out.println("AVERAGE DEVIATION: " + (devsum / pop.size()));
        System.out.println("QUESTIONS ASKED: " + used.size());
        System.out.println();
      }
      sciencetracker.clear();
    }

    // if asked to, write config
    if (outfile != null) {
      try {
        Configuration b = population.getBestConfiguration().getConfiguration();
        ConfigWriter.write(b, outfile);
      } catch (IOException e) {
        System.err.println("ERROR: Cannot write to '" + outfile + "': " + e);
      }
    }

    // is there any point in evolving?
    if (active &&
        population.getBestConfiguration().getFNumber() ==
        population.getWorstConfiguration().getFNumber())
      // all configurations rated equally, so we have no idea which
      // ones are best. leaving the population alone until we learn
      // more.
      return lbest;

    // produce next generation
    produceNextGeneration();
    return lbest;
  }

  private void produceNextGeneration() {
    // this code uses simple (mu, lambda) evolution. according to the
    // literature tournament selection should be better, but careful
    // experimentation revealed no measurable benefits whatever. the
    // tournament code has therefore been removed.

    List<GeneticConfiguration> pop = population.getConfigs();
    int size = pop.size();
    List<GeneticConfiguration> nextgen = new ArrayList(size);
    for (GeneticConfiguration cfg : pop.subList(0, (int) (size * 0.02)))
      nextgen.add(new GeneticConfiguration(cfg));
    for (GeneticConfiguration cfg : pop.subList(0, (int) (size * 0.03)))
      nextgen.add(new GeneticConfiguration(cfg));
    int start = (int) (size * 0.25);
    for (GeneticConfiguration cfg : pop.subList(0, start))
      nextgen.add(new GeneticConfiguration(cfg));
    for (GeneticConfiguration cfg : pop.subList(0, start))
      nextgen.add(new GeneticConfiguration(cfg));
    int remaining = pop.size() - nextgen.size(); // avoids rounding errors
    for (GeneticConfiguration cfg : pop.subList(start, start + remaining))
      nextgen.add(new GeneticConfiguration(cfg));

    if (nextgen.size() > size)
      nextgen = nextgen.subList(0, size);

    for (GeneticConfiguration cfg : nextgen) {
      double rr = cfg.getRecombinationRate();
      while (rr > Math.random()) {
        cfg.mateWith(population.pickRandomConfig());
        rr -= 1.0;
      }

      for (int ix = 0; ix < cfg.getMutationRate(); ix++)
        cfg.mutate();
    }

    population.setNewGeneration(nextgen);
  }

  private void evaluateAll(ExemplarsTracker tracker) {
    List<GeneticConfiguration> pop = population.getConfigs();
    for (GeneticConfiguration cfg : pop) {
      if (!quiet)
        System.out.println(cfg);
      double f = evaluate(cfg, tracker);
      if (!quiet)
        System.out.print("  " + f);
      if (f > best) {
        if (!quiet)
          System.out.println("\nNEW BEST!\n");
        best = f;
      }
      if (!quiet) {
        if (scientific)
          System.out.println("  (actual: " + sciencetracker.get(cfg) + ")");
        else
          System.out.println();
      }
    }
  }

  private void evaluateAllThreaded(ExemplarsTracker tracker) {
    WorkManager mgr = new WorkManager(population.getConfigs());

    // start threads
    WorkerThread[] workers = new WorkerThread[threads];
    for (int ix = 0; ix < threads; ix++) {
      workers[ix] = new WorkerThread(tracker, mgr, ix);
      workers[ix].start();
    }

    // wait for threads to finish
    try {
      for (int ix = 0; ix < workers.length; ix++)
        workers[ix].join();
    } catch (InterruptedException e) {
      // argh
    }
  }

  /**
   * Evaluates the given configuration, storing the score on the object.
   * @param config The configuration to evaluate.
   * @param listener A match listener to register on the processor. Can
   *                 be null.
   * @return The F-number of the configuration.
   */
  private double evaluate(GeneticConfiguration config,
                          MatchListener listener) {
    Configuration cconfig = config.getConfiguration();
    Processor proc = new Processor(cconfig, database);
    TestFileListener eval = makeEval(cconfig, testdb, proc);

    if (active || incomplete)
      // in active learning the test file is incomplete, so F-number eval
      // should be optimistic. similarly if the test file is known to be
      // incomplete, for whatever reason
      eval.setPessimistic(false);

    proc.addMatchListener(eval);
    TestFileListener seval = null;
    if (scientific) {
      seval = makeEval(cconfig, ((LinkFileOracle) oracle).getLinkDatabase(),
                       proc);
      seval.setPessimistic(true);
      proc.addMatchListener(seval);
    }
    if (listener != null)
      proc.addMatchListener(listener);
    if (cconfig.isDeduplicationMode())
      proc.linkRecords(cconfig.getDataSources());
    else
      proc.linkRecords(cconfig.getDataSources(2), false);

    if (seval != null)
      sciencetracker.put(config, seval.getFNumber());

    config.setFNumber(eval.getFNumber());
    return eval.getFNumber();
  }

  private TestFileListener makeEval(Configuration cfg, LinkDatabase testdb,
                                    Processor proc) {
    TestFileListener eval = new TestFileListener(testdb, cfg, false,
                                                 proc, false, false);
    eval.setQuiet(true);
    return eval;
  }

  /**
   * Returns the best configuration we've seen so far.
   */
  public GeneticConfiguration getBestConfiguration() {
    return population.getBestConfiguration();
  }

  /**
   * Returns the current population.
   */
  public GeneticPopulation getPopulation() {
    return population;
  }

  private void askQuestions(ExemplarsTracker tracker) {
    int count = 0;
    Filter f = new Filter(tracker.getExemplars());
    while (true) {
      Pair pair = f.getNext();
      if (pair == null)
        break;
      Record r1 = database.findRecordById(pair.id1);
      if (r1 == null)
        r1 = secondary.get(pair.id1);
      Record r2 = database.findRecordById(pair.id2);

      System.out.println();
      PrintMatchListener.prettyCompare(r1, r2, (double) pair.counter,
                                       "Possible match",
                                       config.getProperties());

      LinkKind kind = oracle.getLinkKind(pair.id1, pair.id2);
      Link link = new Link(pair.id1, pair.id2, LinkStatus.ASSERTED, kind, 1.0);
      testdb.assertLink(link);

      count++;
      if (count == questions)
        break;
    }
    asked += count;
  }

  private String getid(Record r) {
    for (String propname : r.getProperties()) {
      Property prop = config.getPropertyByName(propname);
      if (prop == null)
        throw new DukeConfigException("Record has property " + propname +
                                      " which is not in configuration");

      if (prop.isIdProperty())
        return r.getValue(propname);
    }
    return null;
  }

  // ----- FILTER

  // this filter is used to weed out questions that are duplicates of
  // questions already asked, but duplicates in a way that's difficult
  // to detect (hence all the code). what it does is explained here:
  // http://www.garshol.priv.no/blog/273.html

  class Filter {
    private List<Pair> exemplars;

    public Filter(List<Pair> exemplars) {
      this.exemplars = exemplars;
      applyFilter();
    }

    public Pair getNext() {
      if (exemplars.isEmpty())
        return null;

      // find the candidate pair with the lowest similarity score with
      // already used pairs
      double bestscore = 2.0;
      Pair thebest = exemplars.get(0); // just in case
      for (Pair candidate : exemplars) {
        if (testdb.inferLink(candidate.id1, candidate.id2) != null)
          continue; // we already know the answer
        double worst = 0.0;

        for (Pair seen : used) {
          double score = compare(candidate, seen);
          if (score > worst)
            worst = score;
        }

        if (worst < bestscore) {
          bestscore = worst;
          thebest = candidate;
        }
      }

      // now we know which one to return
      used.add(thebest);
      exemplars.remove(thebest);
      return thebest;
    }

    // find the n*2 best
    private void applyFilter() {
      List<Pair> chosen = new ArrayList();
      for (int next = 0; chosen.size() < questions * 2 &&
                         next < exemplars.size(); next++) {
        Pair pair = exemplars.get(next);
        if (testdb.inferLink(pair.id1, pair.id2) != null)
          continue; // we already know the answer
        pair.believers = whoThinksThisIsTrue(pair.id1, pair.id2);
        chosen.add(pair);
      }

      exemplars = chosen;
    }

    // we use Jaccard index, which is size of intersection divided by
    // size of union
    private double compare(Pair p1, Pair p2) {
      int intersection = 0;
      int union = 0;
      for (int ix = 0; ix < p1.believers.length; ix++) {
        if (p1.believers[ix] && p2.believers[ix])
          intersection++;
        if (p1.believers[ix] || p2.believers[ix])
          union++;
      }
      return ((double) intersection) / ((double) union);
    }

    private boolean[] whoThinksThisIsTrue(String id1, String id2) {
      Record r1 = database.findRecordById(id1);
      if (r1 == null)
        r1 = secondary.get(id1);
      Record r2 = database.findRecordById(id2);
      if (r2 == null)
        r2 = secondary.get(id2);

      List<GeneticConfiguration> configs = population.getConfigs();
      boolean[] believers = new boolean[configs.size()];
      for (int ix = 0; ix < configs.size(); ix++) {
        Configuration config = configs.get(ix).getConfiguration();
        Processor proc = new Processor(config, database);
        believers[ix] = proc.compare(r1, r2) > config.getThreshold();
      }
      return believers;
    }
  }

  // ----- COMPARATORS

  // this one tries to find correct matches
  static class FindCorrectComparator implements Comparator<Pair> {
    public int compare(Pair p1, Pair p2) {
      // puts the one with the highest count first
      return p2.counter - p1.counter;
    }
  }

  // this one tries to find the matches with the most information, by
  // picking the ones there is most disagreement on
  class DisagreementComparator implements Comparator<Pair> {
    public int compare(Pair p1, Pair p2) {
      int size = population.size();
      return getScore(p2) - getScore(p1);
    }

    private int getScore(Pair pair) {
      int size = population.size();
      return (size - pair.counter) * (size - (size - pair.counter));
    }
  }

  // ----- THREAD HANDLING

  class WorkManager {
    private List<GeneticConfiguration> pop;
    private int next;

    public WorkManager(List<GeneticConfiguration> pop) {
      this.pop = pop;
    }

    public synchronized GeneticConfiguration getNextConfig() {
      if (next < pop.size())
        return pop.get(next++);
      else
        return null;
    }

    public synchronized void evaluated(GeneticConfiguration cfg) {
      double f = cfg.getFNumber();

      if (!quiet) {
        System.out.println(cfg);
        System.out.print("  " + f);
      }
      if (f > best) {
        if (!quiet)
          System.out.println("\nNEW BEST!\n");
        best = f;
      }
      if (!quiet) {
        if (scientific)
          System.out.println("  (actual: " + sciencetracker.get(cfg) + ")");
        else
          System.out.println();
      }
    }
  }

  class WorkerThread extends Thread {
    private WorkManager mgr;
    private ExemplarsTracker tracker;

    public WorkerThread(ExemplarsTracker tracker, WorkManager mgr,
                        int threadno) {
      super("WorkerThread " + threadno);
      this.mgr = mgr;
      this.tracker = tracker;
    }

    public void run() {
      GeneticConfiguration cfg = mgr.getNextConfig();
      while (cfg != null) {
        evaluate(cfg, tracker);
        mgr.evaluated(cfg);
        cfg = mgr.getNextConfig();
      }
    }
  }
}
TOP

Related Classes of no.priv.garshol.duke.genetic.GeneticAlgorithm$Filter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.