// Source code of org.deri.eurostat.toc.ParseToC

package org.deri.eurostat.toc;


import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;


import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathFactory;


import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.deri.eurostat.zip.DownloadZip;
import org.deri.eurostat.zip.UnCompressXML;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


/**
 * Downloads the ToC.XML from EuroStat and extracts the Dataset URLs from it. Each of
 * the parsed URL is than sent to UnCompressXML class to uncompress the file and
 * RDFize the DSD and SDMX observations.
 * 
 * @author Aftab Iqbal
 *
 */
public class ParseToC {


  private Document xmlDocument;
  public ArrayList<String> lstDatasetURLs = new ArrayList<String>();
  public HashMap<String, HashMap<String,String>> toc = new HashMap<String, HashMap<String,String>>(); 
  private static int printDatasets = 10;
  UnCompressXML obj = new UnCompressXML();
  DownloadZip zip = new DownloadZip();
  
  public InputStream get_ToC_XMLStream()
  {
    InputStream is = null;
    try {
      URL url = new URL("http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?sort=1&file=table_of_contents.xml");
      HttpURLConnection conn = (HttpURLConnection)url.openConnection();
      is = conn.getInputStream();


      if (conn.getResponseCode() != 200) {
        System.err.println(conn.getResponseCode());
      }
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
    
    return is;
  }
  
  public void initObjects(InputStream in){        
        try {
            xmlDocument = DocumentBuilderFactory.
      newInstance().newDocumentBuilder().
      parse(in);            
        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (SAXException ex) {
            ex.printStackTrace();
        } catch (ParserConfigurationException ex) {
            ex.printStackTrace();
        }       
    }
  
  public void initObjects(String filePath){        
        try {
          xmlDocument = DocumentBuilderFactory.
      newInstance().newDocumentBuilder().
      parse(filePath);            
        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (SAXException ex) {
            ex.printStackTrace();
        } catch (ParserConfigurationException ex) {
            ex.printStackTrace();
        }       
    }
  
  public void parseDataSets()
  {
    Element element = xmlDocument.getDocumentElement();
    
    NodeList nl = element.getElementsByTagName("nt:leaf");
    if(nl != null && nl.getLength() > 0)
    {
      for(int i = 0 ; i < nl.getLength();i++)
      {
        Element ele = (Element)nl.item(i);
        if(ele.getAttribute("type").equals("dataset") || ele.getAttribute("type").equals("table"))
        {
          getDatasetURLs(ele);
        }
      }
    }
    
  }


  public void printResults()
  {
    int count = 0;
    
    System.out.println("Total Datasets found in the ToC are : " + lstDatasetURLs.size());
    for(String str:lstDatasetURLs)
    {
      System.out.println(str);
      if(++count == printDatasets)
        break;
    }
  }
  
  // This piece of code will parse the compressed file URLs sequentially.
  public void parseXMLFiles(String downLoadPath)
  {
    for(String str:lstDatasetURLs)
    {
      obj.parseZipFile(str, downLoadPath);
    }
  }


  public void downloadXMLFiles(String tempZipPath, String tempTsvPath)
  {
    for(String str:lstDatasetURLs)
    {
      zip.zipURL(str, tempZipPath, tempTsvPath);
    }
  }


  // get the URLs of datasets having SDMX format
  public void getDatasetURLs(Element element)
  {
    NodeList nl = element.getElementsByTagName("nt:downloadLink");
    if(nl != null && nl.getLength() > 0)
    {
      for(int i = 0 ; i < nl.getLength();i++)
      {
        Element ele = (Element)nl.item(i);
        if(ele.getAttribute("format").equals("sdmx"))
        {
          if(!lstDatasetURLs.contains(ele.getTextContent()))
            lstDatasetURLs.add(ele.getTextContent());
          
        }
      }
    }
    
  }
  
  public void extractDatasetTitles()
  {
    Element element = xmlDocument.getDocumentElement();
    
    NodeList nl = element.getElementsByTagName("nt:leaf");
    if(nl != null && nl.getLength() > 0)
    {
      for(int i = 0 ; i < nl.getLength();i++)
      {
        Element ele = (Element)nl.item(i);
        if(ele.getAttribute("type").equals("dataset") || ele.getAttribute("type").equals("table"))
        {
          storeDatasetTitles(ele);
        }
      }
    }


  }
  
  public void storeDatasetTitles(Element element)
  {
    HashMap<String, String> hsh = new HashMap<String, String>();
    String code = "";
    
    NodeList nl = element.getElementsByTagName("nt:code");
    
    code = nl.item(0).getTextContent();
    
    nl = element.getElementsByTagName("nt:title");
    if(nl != null && nl.getLength() > 0)
    {
      for(int i = 0 ; i < nl.getLength();i++)
      {
        Element ele = (Element)nl.item(i);
        hsh.put(ele.getAttribute("language"), ele.getTextContent());
      }
    }


    toc.put(code, hsh);
  }
  
  private static void usage()
  {
    System.out.println("usage: ParseToC [parameters]");
    System.out.println();
    System.out.println("  -n num    No. of Dataset URLs to print. Default sets to 10.");
  }
  
  public void parseToC()
  {
    InputStream is = get_ToC_XMLStream();
    initObjects(is);
    parseDataSets();
    printResults();
  }
  
  public void RDFize(String downLoadPath)
  {
    InputStream is = get_ToC_XMLStream();
    initObjects(is);
    parseDataSets();
  }


  public void getDatasetTitles()
  {
    InputStream is = get_ToC_XMLStream();
    initObjects(is);
    extractDatasetTitles();
  }


  public void getDatasetTitles(String filePath)
  {
    initObjects(filePath);
    extractDatasetTitles();
  }
  
  public void downloadZip(String tempZipPath, String tempTsvPath)
  {
    InputStream is = get_ToC_XMLStream();
    initObjects(is);
    parseDataSets();
    downloadXMLFiles(tempZipPath, tempTsvPath);
  }
  
  public static void main(String[] args) throws Exception
  {
    ParseToC obj = new ParseToC();
    
    CommandLineParser parser = new BasicParser( );
    Options options = new Options( );
    options.addOption("h", "help", false, "Print this usage information");
    options.addOption("n", "num", true, "No. of Dataset URLs to print. Default sets to 10.");


    CommandLine commandLine = parser.parse( options, args );
    
    if( commandLine.hasOption('h') ) {
        usage();
        return;
     }
    
    if(commandLine.hasOption('n'))
      printDatasets = Integer.parseInt(commandLine.getOptionValue('n'));
    
    obj.parseToC();
  }
  
}
// Source code of org.deri.eurostat.toc.ParseToC

// Related classes of org.deri.eurostat.toc.ParseToC