Package lupos.engine.indexconstruction

Source Code of lupos.engine.indexconstruction.RDF3XIndexConstruction$GenerateIDTriplesUsingStringSearch2

/**
* Copyright (c) 2013, Institute of Information Systems (Sven Groppe and contributors of LUPOSDATE), University of Luebeck
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
*   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following
*     disclaimer.
*   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
*     following disclaimer in the documentation and/or other materials provided with the distribution.
*   - Neither the name of the University of Luebeck nor the names of its contributors may be used to endorse or promote
*     products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
*
*/
package lupos.engine.indexconstruction;

import lupos.compression.Compression;
import lupos.datastructures.dbmergesortedds.DBMergeSortedBag;
import lupos.datastructures.dbmergesortedds.DBMergeSortedSetUsingTrie;
import lupos.datastructures.dbmergesortedds.DiskCollection;
import lupos.datastructures.dbmergesortedds.SortConfiguration;
import lupos.datastructures.items.Triple;
import lupos.datastructures.items.literal.*;
import lupos.datastructures.items.literal.codemap.StringIntegerMapJava;
import lupos.datastructures.paged_dbbptree.DBBPTree.Generator;
import lupos.datastructures.paged_dbbptree.node.nodedeserializer.StringIntegerNodeDeSerializer;
import lupos.datastructures.patriciatrie.TrieSet;
import lupos.datastructures.queryresult.SIPParallelIterator;
import lupos.datastructures.stringarray.StringArray;
import lupos.engine.evaluators.CommonCoreQueryEvaluator;
import lupos.engine.evaluators.RDF3XQueryEvaluator;
import lupos.engine.operators.index.Indices;
import lupos.engine.operators.index.Indices.DATA_STRUCT;
import lupos.engine.operators.index.adaptedRDF3X.RDF3XIndexScan.CollationOrder;
import lupos.engine.operators.index.adaptedRDF3X.SixIndices;
import lupos.engine.operators.tripleoperator.TripleConsumer;
import lupos.io.helper.OutHelper;
import lupos.misc.TimeInterval;
import lupos.misc.util.ImmutableIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;

/**
* This class constructs the RDF3X indices on disk using a dictionary, which is
* also constructed on disk...
*/
public class RDF3XIndexConstruction {

  // Logger shared by all static methods and nested classes of this class.
  private static final Logger log = LoggerFactory.getLogger(RDF3XIndexConstruction.class);

  // Passed as first and second constructor argument of the string-to-integer DBBPTree
  // that is built during dictionary construction.
  // NOTE(review): presumably the node-size parameters for inner nodes and leaves -- confirm against DBBPTree.
  private static final int k = 1000;
  private static final int k_ = 1000;

  // Threshold for the number of strings held in the in-memory search tree of
  // GenerateIDTriplesUsingStringSearch2: once the tree grows beyond this value the
  // buffered triples are processed and the tree is cleared. Not final because main
  // overwrites it with the optional sixth command line argument.
  public static long LIMIT_ELEMENTS_IN_TRIE = 50000000;

  public static void insertUsedStringRepresentations(final URILiteral u,
      final String dataFormat,
      final SortedSet<String> rdftermsRepresentations,
      final TripleConsumer tc) {
    rdftermsRepresentations.add(u.toString());
    try {
      CommonCoreQueryEvaluator.readTriples(dataFormat, u.openStream(), tc);
    } catch (final Exception e) {
      log.error(e.getMessage(), e);
    }
  }

  /**
   * Constructs the large-scale indices for RDF3X.
   * The command line arguments are
   * <datafile> <dataformat> <encoding> <NONE|BZIP2|HUFFMAN|GZIP> <directory for indices> [LIMIT_ELEMENTS_IN_MEMORY [<datafile2> [<datafile3> ...]]]
   * If you want to import more than one file you can use the additional parameters <datafilei>!
   *
   * @param args
   *            command line arguments
   */
  public static void main(final String[] args) {
    try {

      log.info("Starting program to construct an RDF3X Index for LUPOSDATE...");
      log.debug("[help is printed when using less than 5 command line arguments]");
      log.debug("_______________________________________________________________");

      if (args.length < 5) {
        log.error("Usage: java -Xmx768M lupos.engine.indexconstruction.RDF3XIndexConstruction <datafile> <dataformat> <encoding> <NONE|BZIP2|HUFFMAN|GZIP> <directory for indices> [LIMIT_ELEMENTS_IN_MEMORY [<datafile2> [<datafile3> ...]]]");
        log.error("Example: java -Xmx768M lupos.engine.indexconstruction.RDF3XIndexConstruction data.n3 N3 UTF-8 NONE /luposdateindex 500000");
        return;
      }

      final Date start = new Date();
      log.debug("Starting time: {}", start);

      LiteralFactory.setType(LiteralFactory.MapType.LAZYLITERALWITHOUTINITIALPREFIXCODEMAP);
      Indices.setUsedDatastructure(DATA_STRUCT.DBBPTREE);

      final String datafile = args[0];
      final String dataFormat = args[1];
      CommonCoreQueryEvaluator.encoding = args[2];

      final String compressor = args[3];
      if(compressor.compareTo("BZIP2")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.BZIP2);
      } else if(compressor.compareTo("HUFFMAN")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.HUFFMAN);
      } else if(compressor.compareTo("GZIP")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.GZIP);
      } else {
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.NONE);
      }

      final String[] dir = new String[] { args[4] };
      final String writeindexinfo = dir[0]+File.separator+RDF3XQueryEvaluator.INDICESINFOFILE;
      DBMergeSortedBag.setTmpDir(dir);
      DiskCollection.setTmpDir(dir);
      lupos.datastructures.paged_dbbptree.DBBPTree.setTmpDir(args[4],true);

      final Collection<URILiteral> defaultGraphs = new LinkedList<URILiteral>();
      final Collection<URILiteral> namedGraphs = new LinkedList<URILiteral>();
      defaultGraphs.add(LiteralFactory.createURILiteralWithoutLazyLiteral("<file:" + datafile+ ">"));

      if(args.length>5){
        LIMIT_ELEMENTS_IN_TRIE = Long.parseLong(args[5]);
        DBMergeSortedSetUsingTrie.LIMIT_ELEMENTS_IN_SET = LIMIT_ELEMENTS_IN_TRIE;
      }
      for(int i=6; i<args.length; i++){
        defaultGraphs.add(LiteralFactory.createURILiteralWithoutLazyLiteral("<file:" + args[i]+ ">"));
      }

      // Construct dictionary:

      final Thread codeMapConstructionThread = new Thread() {
        @Override
        public void run() {
            final DBMergeSortedSetUsingTrie rdftermsRepresentations = new DBMergeSortedSetUsingTrie(new SortConfiguration(), String.class);

            final TripleConsumer tc = new TripleConsumer() {

              @Override
              public void consume(final Triple triple) {
                for (final Literal l : triple) {
                  rdftermsRepresentations.add(l.toString());
                  if (l.originalStringDiffers()) {
                    rdftermsRepresentations.add(l.originalString());
                  }
                }
              }

            };
            for (final URILiteral u : defaultGraphs) {
              insertUsedStringRepresentations(u, dataFormat, rdftermsRepresentations, tc);
            }
            for (final URILiteral u : namedGraphs) {
              insertUsedStringRepresentations(u, dataFormat, rdftermsRepresentations, tc);
            }
            // now generate B+-tree for integer-string map and
            // string-integer
            // map of the codemap!
            final Generator<String, Integer> smsi = new Generator<String, Integer>() {

              @Override
              public Iterator<java.util.Map.Entry<String, Integer>> iterator() {
                return new ImmutableIterator<java.util.Map.Entry<String, Integer>>() {
                  Iterator<String> it = rdftermsRepresentations.iterator();
                  int index = 1;

                  @Override
                  public boolean hasNext() {
                    return this.it.hasNext();
                  }

                  @Override
                  public java.util.Map.Entry<String, Integer> next() {
                    if (!this.it.hasNext()) {
                      return null;
                    } else {
                      return new java.util.Map.Entry<String, Integer>() {
                        String s = it.next();
                        int localIndex = index++;

                        @Override
                        public String getKey() {
                          return this.s;
                        }

                        @Override
                        public Integer getValue() {
                          return this.localIndex;
                        }

                        @Override
                        public Integer setValue(
                            final Integer arg0) {
                          throw new UnsupportedOperationException();
                        }

                      };
                    }
                  }
                };
              }

              @Override
              public int size() {
                return rdftermsRepresentations.size();
              }

            };

            rdftermsRepresentations.sort();

            final Thread thread0 = new Thread() {
              @Override
              public void run() {
                lupos.datastructures.paged_dbbptree.DBBPTree<String, Integer> simap;
                try {
                  simap = new lupos.datastructures.paged_dbbptree.DBBPTree<String, Integer>(
                      k,
                      k_,
                      new StringIntegerNodeDeSerializer());
                  simap.generateDBBPTree(smsi);
                  LazyLiteral.setHm(new StringIntegerMapJava(
                      simap));
                } catch (final IOException e) {
                  log.error(e.getMessage(), e);
                }
              }
            };
            final Thread thread1 = new Thread() {
              @Override
              public void run() {
                StringArray ismap;
                try {
                  ismap = new StringArray();
                  ismap.generate(rdftermsRepresentations.iterator());
                  LazyLiteral.setV(ismap);
                } catch (final IOException e) {
                  log.error(e.getMessage(), e);
                }
              }
            };
            thread0.start();
            thread1.start();
            try {
              thread0.join();
              thread1.join();
            } catch (final InterruptedException e) {
              log.error(e.getMessage(), e);
            }
            rdftermsRepresentations.release();

        }
      };
      codeMapConstructionThread.start();

      try {
        codeMapConstructionThread.join();
      } catch (final InterruptedException e) {
        log.error(e.getMessage(), e);
      }

      final Date intermediate = new Date();
      final TimeInterval codemapInterval = new TimeInterval(start, intermediate);
      log.info("Codemap constructed in: {}", codemapInterval);
      log.info("Codemap contains {} entries!", LazyLiteral.getHm().size());

      // for debugging purposes:
//      final TripleConsumer interTripleConsumer = new TripleConsumer() {
//        public void consume(final Triple triple) {
//
//          // the generated codes using the dictionary:
//          System.out.println("(" + getCode(triple.getSubject())
//              + ", " + getCode(triple.getPredicate()) + ", "
//              + getCode(triple.getObject()) + ")");
//
//          indices.consume(triple);
//        }
//
//      };
//
//      new GenerateIDTriplesUsingStringSearch2(rdfURL, dataFormat,
//          interTripleConsumer);

      final Indices indices = new SixIndices(defaultGraphs.iterator().next());
      new GenerateIDTriplesUsingStringSearch2(defaultGraphs, dataFormat, indices);

      // write out index info

      final OutputStream out = new BufferedOutputStream(new FileOutputStream(writeindexinfo));
      indices.constructCompletely();

      OutHelper.writeLuposInt(lupos.datastructures.paged_dbbptree.DBBPTree.getCurrentFileID(), out);

      ((lupos.datastructures.paged_dbbptree.DBBPTree) ((StringIntegerMapJava) LazyLiteral.getHm()).getOriginalMap()).writeLuposObject(out);
      ((StringArray) LazyLiteral.getV()).writeLuposStringArray(out);
      OutHelper.writeLuposInt(1, out);
      LiteralFactory.writeLuposLiteral(defaultGraphs.iterator().next(), out);
      indices.writeIndexInfo(out);
      OutHelper.writeLuposInt(0, out);
      out.close();
      final Date end = new Date();
      log.debug("_______________________________________________________________");
      log.info("Done, RDF3X index constructed!");
      log.debug("End time: {}", end);

      log.debug("Used time: {}", new TimeInterval(start, end));
      log.debug("Number of imported triples: {}", ((SixIndices)indices).getIndex(CollationOrder.SPO).size());
    } catch (final Exception e) {
      log.error(e.getMessage(), e);
    }
  }

  private static String getCode(final Literal literal) {
    if (literal instanceof LazyLiteral) {
      String result = "" + ((LazyLiteral) literal).getCode();
      if (literal instanceof LazyLiteralOriginalContent) {
        result += "(code original content:"
          + ((LazyLiteralOriginalContent) literal)
          .getCodeOriginalContent() + ")";
      }
      return result;
    } else {
      return "Error - no lazy literal";
    }
  }

  public static class GenerateIDTriplesUsingStringSearch2 {

    public GenerateIDTriplesUsingStringSearch2(
        final Collection<URILiteral> graphURIs, final String dataFormat,
        final TripleConsumer tc) throws Exception {

      final TrieSet searchtree = TrieSet.createRamBasedTrieSet();

      final DiskCollection<Triple> triples = new DiskCollection<Triple>(Triple.class);

      try {

        final TripleConsumer tripleConsumer=new TripleConsumer() {

          @Override
          public void consume(final Triple triple) {
            for (final Literal l : triple) {
              searchtree.add(l.toString());
              if (l.originalStringDiffers()) {
                searchtree.add(l.originalString());
              }
            }
            triples.add(triple);
            if (searchtree.size()>LIMIT_ELEMENTS_IN_TRIE) {
              GenerateIDTriplesUsingStringSearch2.this.handleRun(searchtree, triples, tc);
            }
          }

        };

        for(final URILiteral graphURI:graphURIs){
          CommonCoreQueryEvaluator.readTriples(dataFormat,
              graphURI.openStream(), tripleConsumer);
        }
        if (searchtree.size() > 0) {
          this.handleRun(searchtree, triples, tc);
          triples.release();
        }
      } catch (final IOException e) {
        log.error(e.getMessage(), e);
      }
    }

    private void handleRun(final TrieSet searchtree,
        final Collection<Triple> triples, final TripleConsumer tc) {
      final int[] map = this.getMap(searchtree);

      for (final Triple triple : triples) {
        final Triple dummy = new Triple(triple.getPos(0),
            triple.getPos(1), triple.getPos(2));
        for (int pos = 0; pos < 3; pos++) {
          if (triple.getPos(pos).originalStringDiffers()) {
            dummy.setPos(
                pos,
                new LazyLiteralOriginalContent(
                    map[searchtree.getIndex(triple.getPos(
                        pos).toString())],
                        map[searchtree.getIndex(triple.getPos(
                            pos).originalString())]));
          } else {
            dummy.setPos(
                pos,
                new LazyLiteral(map[searchtree.getIndex(triple
                    .getPos(pos).toString())]));
          }
        }
        tc.consume(dummy);
      }

      // clear the searchtree and the triples collection for the next
      // "Run"
      searchtree.clear();
      triples.clear();
    }

    private int[] getMap(final TrieSet searchtree) {
      // build map from local dictionary to global
      // dictionary...
      final int[] map = new int[searchtree.size()];

      // get global map:
      final Iterator<java.util.Map.Entry<String, Integer>> iterator = ((StringIntegerMapJava) LazyLiteral
          .getHm()).getMap().entrySet().iterator();
      java.util.Map.Entry<String, Integer> current = iterator.next();

      int index = 0;
      for (final String s : searchtree) {
        if (iterator instanceof SIPParallelIterator) {
          while (s.compareTo(current.getKey()) != 0) {
            current = ((SIPParallelIterator<java.util.Map.Entry<String, Integer>, String>) iterator)
            .next(s);
          }
        } else {
          while (s.compareTo(current.getKey()) != 0) {
            current = iterator.next();
          }
        }
        map[index++] = current.getValue();
      }
      return map;
    }
  }
}
TOP

Related Classes of lupos.engine.indexconstruction.RDF3XIndexConstruction$GenerateIDTriplesUsingStringSearch2

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.