Package virtTripleLoaderInit

Source Code of virtTripleLoaderInit.VirtTripleLoaderInit$TrueFilter

/*
*  $Id$
*
*  This file is part of the OpenLink Software Virtuoso Open-Source (VOS)
*  project.
*
*  Copyright (C) 1998-2014 OpenLink Software
*
*  This project is free software; you can redistribute it and/or modify it
*  under the terms of the GNU General Public License as published by the
*  Free Software Foundation; only version 2 of the License, dated June 1991.
*
*  This program is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
*  General Public License for more details.
*
*  You should have received a copy of the GNU General Public License along
*  with this program; if not, write to the Free Software Foundation, Inc.,
*  51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

//
// Java utility that breaks WARC files down into pieces suitable for insertion
//
// WARC-extract:
// for WARC file {
//   for each block {
//     write block into numbered file ldr_block_N.nt
//     insert entry for ldr_block_N.nt into load_list in cluster
//     insert metadata from block into mdGraph
//   }
//   write mdGraph into md_N.nt
//   insert md_N.nt into load_list in cluster
// }
//

package virtTripleLoaderInit;

import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.MeasurableInputStream;
import it.unimi.dsi.law.warc.filters.Filter;
import it.unimi.dsi.law.warc.filters.Filters;
import it.unimi.dsi.law.warc.io.GZWarcRecord;
import it.unimi.dsi.law.warc.io.WarcFilteredIterator;
import it.unimi.dsi.law.warc.io.WarcRecord;
import it.unimi.dsi.law.warc.util.BURL;
import it.unimi.dsi.law.warc.util.WarcHttpResponse;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.*;
import java.util.*;

import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;

import org.openrdf.model.Graph;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.rio.ntriples.NTriplesParser;
import org.openrdf.rio.helpers.RDFHandlerBase;
import org.openrdf.rio.RDFHandlerException;

import org.openrdf.rio.ntriples.NTriplesWriter;

import com.sun.org.apache.xalan.internal.xsltc.cmdline.getopt.GetOpt;

import virtuoso.jdbc3.*;




public class VirtTripleLoaderInit {

  // When true, main() dumps header details of every WARC record to stdout.
  private static final Boolean debugMode = false;
  // Program name, version components and copyright line printed by printUsage().
  private static final String myName = "Virtuoso Triple Loader Java";
  private static final Integer myMajorVer = 0;
  private static final Integer myMinorVer = 1;
  private static final Integer myBuild = 0;
  private static final String myCopyright = "Copyright (c) 2008-2014 OpenLink Software";

  // Built-in connection settings. Judging by the "Def" suffix these are the
  // defaults for the JDBC URL, user name, password file name and password.
  private static final String virtConnectUrlDef = "jdbc:virtuoso://neo:1111";
  private static final String virtUserDef = "dba";
  private static final String virtPwdFileDef = "virt_secret";
  private static final String virtPwdDef = "dba";

  // Effective connection settings, populated by the constructor from the
  // JDBC_DS, VIRT_USER and VIRT_SECRET environment variables.
  private String virtConnectUrl;
  private String virtUser;
  private String virtPwd;
  private String virtPwdFileName;

  // TODO: (ghard) Should be cmdline argument
  // NOTE(review): it is not clear which setting this TODO refers to;
  // presumably the connection settings above - confirm with the author.

  // Shared JDBC connection and the prepared "ld_add (?,?)" statement that
  // main() executes once per extracted block file.
  private static Connection virtConnection;
  private static PreparedStatement ldAddStmt;

  // Logger used for all progress and error output (created by the constructor).
  private static Logger l;
  // NOTE(review): never assigned or read in this file - presumably a
  // placeholder for command-line option parsing; confirm before removing.
  private GetOpt g;
  // Writer that stores one WARC record body per file (created by the constructor).
  private static WarcBlockWriter w;
  // Path of the block file most recently written by main().
  private static String curFile;


  private class WarcBlockWriter {
    private Integer fileNo = 0;
    private String fileFormat = "data%01d/triples_%07d.nt";
    private String curFile;
    private FileOutputStream outStream;
    private byte[] buf = new byte[2048];
    private int bytesRead;
    private int bytesInFile;

    public String write(MeasurableInputStream _block, int cnt, int step) throws IOException {
      bytesInFile = 0;
      curFile = String.format(fileFormat, step, cnt);
      outStream = new FileOutputStream(curFile);

      int in_url = 0;
      int c = 0;
      while ((c = _block.read()) != -1) {

        bytesInFile ++;
        outStream.write(c);

      }

      l.output("Wrote " + bytesInFile + " in " + curFile);

      outStream.flush();
      outStream.close();
      _block.close();
      return (curFile);

    }
  }

  public VirtTripleLoaderInit() {
    l = new Logger();
    w = new WarcBlockWriter();

    virtConnectUrl = System.getenv("JDBC_DS");
    virtUser = System.getenv("VIRT_USER");
    virtPwdFileName = System.getenv("VIRT_SECRET");

    if (virtConnectUrl == "")
      virtConnectUrl = virtConnectUrlDef;

    try {
      FileInputStream pwdIn = new FileInputStream (virtPwdFileName);
    }
    catch (FileNotFoundException e) {
      l.output("Using default password for virtuoso.");
      virtPwd = virtPwdDef;
    }

    try {
      Class.forName("virtuoso.jdbc3.Driver");

//       TODO: Use file to store secret. Check file perms and refuse to work if not restrictive enough

      virtConnection =
        DriverManager.getConnection(virtConnectUrl,virtUser,virtPwd);

      ldAddStmt = virtConnection.prepareStatement("ld_add (?,?)");

    }
    catch (SQLException e) {
      l.error(e);
      l.error("WX0001: Cannot obtain connection to Virtuoso Database. Exiting.");
      System.exit(-1);
    }
    catch (Exception e) {
        e.printStackTrace();
        System.exit (-1);
    }
  }

  private static void printUsage () {
    System.out.printf("%s %d.%d.%04d\n", myName, myMajorVer, myMinorVer, myBuild);
    System.out.println(myCopyright);
  }

  public static class TrueFilter extends Filter<BURL> {
    @Override
    public boolean accept(BURL x) {
      return true;
    }

    @Override
    public String toExternalForm() {
      return "true";
    }
  }

  /**
   * @param args
   * @throws FileNotFoundException
   * @author ghard
   */

  public static void main(String[] args) throws FileNotFoundException {

      VirtTripleLoaderInit vtl = new VirtTripleLoaderInit ();

      if (args.length < 1 || args.length > 2) {
    printUsage();
    l.error("WX0002: Invalid number of arguments. Exiting.");
    System.exit(-1);
      }

      if (args.length == 2) {

      }

      String inFile = args[0];
      int start = Integer.parseInt (args[1]);
      int step = 1000000;
      int min = start * step;
      int max = min + step;

      System.out.println (start);
      l.output("START " + inFile);

      final FastBufferedInputStream in =
    new FastBufferedInputStream(new FileInputStream (new File(inFile)));

      GZWarcRecord record = new GZWarcRecord();
      Filter<WarcRecord> filter = Filters.adaptFilterBURL2WarcRecord (new TrueFilter());
      WarcFilteredIterator it = new WarcFilteredIterator(in, record, filter);

      WarcHttpResponse response = new WarcHttpResponse();

      Graph mdGraph = new org.openrdf.model.impl.GraphImpl();
      String mdGraphURI = "http://challenge.semanticweb.org/2008/metadata";
      ValueFactory vf = mdGraph.getValueFactory();
      String dcNS = "http://purl.org/dc/elements/1.1/";
      DatatypeFactory dtf = null;

      try {
    dtf = DatatypeFactory.newInstance();
      } catch (DatatypeConfigurationException e1) {
    // TODO Auto-generated catch block
    e1.printStackTrace();
      }

      GregorianCalendar c = new GregorianCalendar ();

      try {
    int cnt = 0;

    //      while (cnt < 10 && it.hasNext()) {
    while (it.hasNext()) {

        WarcRecord nextRecord = it.next();

        //Get the HttpResponse
        try {
      response.fromWarcRecord (nextRecord);

      if (debugMode) {
          System.out.println("RECORD     : " + String.format("%05d", cnt));
          System.out.println(" subjectUri: " + nextRecord.header.subjectUri);
          System.out.println("contentType: " + nextRecord.header.contentType);
          System.out.println(" dataLength: " + nextRecord.header.dataLength);
          System.out.println("actual data: " + nextRecord.block.length());
          System.out.println("    missing: " +
            (nextRecord.header.dataLength - nextRecord.block.length()) + "b");
      }
      l.output(nextRecord.header.subjectUri.toString());

      URI s, p, o;
      Literal lit;
      if (cnt == max ){ return ; }
      if (cnt >= min && cnt < max) {

          s = vf.createURI(nextRecord.header.subjectUri.toString());
          p = vf.createURI(dcNS, "source");
          lit = vf.createLiteral(inFile);

          mdGraph.add(s,p,lit);

          c.setTime(nextRecord.header.creationDate);
          XMLGregorianCalendar xc = dtf.newXMLGregorianCalendar(c);

          p = vf.createURI(dcNS, "date");
          lit = vf.createLiteral(xc);

          mdGraph.add(s,p,lit);

          curFile = w.write(response.contentAsStream(), cnt, start);

          try {
        ldAddStmt.setString(1, curFile);
        ldAddStmt.setString(2, nextRecord.header.subjectUri.toString());
        ResultSet res = ldAddStmt.executeQuery();
          }
          catch (SQLException e) {
        l.error(e);
        l.error("WX0003: Insert to loader table failed. Exiting.");
        System.exit(-1);
          }
      }
        }
        catch (IOException e) {
      e.printStackTrace();
      continue;
        }
        cnt++;
    }
    }


      catch (RuntimeException re) {
    l.error ("WX0004 Unexpected Runtime Error Thrown.");
    l.error (re.toString());
    re.printStackTrace();
    System.exit (-1);
      }
      l.output ("Finish loop ");

      Iterator<Statement> iter = mdGraph.iterator();
      Integer _stmtcnt = 0;
      String mdFile = String.format("%s.md.nt", inFile);

      FileOutputStream outStream = new FileOutputStream(mdFile);
      NTriplesWriter ntw = new NTriplesWriter (outStream);

      ntw.handleNamespace ("dc", dcNS);

      try {
    ntw.startRDF();
    while (iter.hasNext()) {
        Statement _stmt = iter.next();
        ntw.handleStatement(_stmt);
        _stmtcnt++;
    }
    ntw.endRDF();
    outStream.close();
      }
      catch (Exception e) {
    e.printStackTrace();
    System.exit (-1);
      }

      l.output(_stmtcnt + " statements created in metadata graph " + mdGraphURI);
      }
  }
TOP

Related Classes of virtTripleLoaderInit.VirtTripleLoaderInit$TrueFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.