Source Code of com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer

/* Copyright (C) 2012 Intel Corporation.
 *     All rights reserved.
 *           
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 *
 * For more about this software visit:
 *      http://www.01.org/GraphBuilder 
 */
package com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Scanner;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;


import org.apache.commons.collections.iterators.EmptyIterator;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;


import com.intel.hadoop.graphbuilder.graph.Edge;
import com.intel.hadoop.graphbuilder.graph.Vertex;
import com.intel.hadoop.graphbuilder.preprocess.inputformat.GraphTokenizer;
import com.intel.hadoop.graphbuilder.types.StringType;


public class WordCountGraphTokenizer implements
    GraphTokenizer<StringType, StringType, StringType> {


  private static final Logger LOG = Logger
      .getLogger(WordCountGraphTokenizer.class);


  @Override
  public void configure(JobConf job) {
    try {
      fs = FileSystem.get(job);
    } catch (IOException e1) {
      e1.printStackTrace();
    }


    String path = job.get("Dictionary");
    if (path != null) {
      try {
        loadDictionary(path);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }


  @Override
  public Class vidClass() {
    return StringType.class;
  }


  @Override
  public Class vdataClass() {
    return StringType.class;
  }


  @Override
  public Class edataClass() {
    return StringType.class;
  }


  public void parse(String s) {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    DocumentBuilder builder;
    counts = new HashMap<String, Integer>();
    try {
      builder = factory.newDocumentBuilder();
      Document doc = builder.parse(new InputSource(new StringReader(s)));
      XPathFactory xfactory = XPathFactory.newInstance();
      XPath xpath = xfactory.newXPath();
      title = xpath.evaluate("//page/title/text()", doc);
      title = title.replaceAll("\\s", "_");
      // title = title.replaceAll("^[^a-zA-Z0-9]", "#");
      // title = title.replaceAll("[^a-zA-Z0-9.]", "_");
      id = xpath.evaluate("//page/id/text()", doc);
      String text = xpath.evaluate("//page/revision/text/text()", doc);


      if (!text.isEmpty()) {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
        while (stream.incrementToken()) {
          String token = stream.getAttribute(TermAttribute.class).term();


          if (dictionary != null && !dictionary.contains(token))
            continue;


          if (counts.containsKey(token))
            counts.put(token, counts.get(token) + 1);
          else
            counts.put(token, 1);
        }
      }
    } catch (ParserConfigurationException e) {
      e.printStackTrace();
    } catch (SAXException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (XPathExpressionException e) {
      e.printStackTrace();
    }
  }


  @Override
  public Iterator<Vertex<StringType, StringType>> getVertices() {
    ArrayList<Vertex<StringType, StringType>> vlist = new ArrayList<Vertex<StringType, StringType>>(
        counts.size() + 1);
    vlist.add(new Vertex<StringType, StringType>(new StringType(id),
        new StringType(title)));
    Iterator<String> iter = counts.keySet().iterator();
    while (iter.hasNext()) {
      vlist.add(new Vertex<StringType, StringType>(new StringType(iter.next()),
          new StringType()));
    }
    return vlist.iterator();
  }


  @Override
  public Iterator<Edge<StringType, StringType>> getEdges() {
    if (counts.isEmpty())
      return EmptyIterator.INSTANCE;


    ArrayList<Edge<StringType, StringType>> elist = new ArrayList<Edge<StringType, StringType>>(
        counts.size());
    Iterator<Entry<String, Integer>> iter = counts.entrySet().iterator();
    while (iter.hasNext()) {
      Entry<String, Integer> e = iter.next();
      elist.add(new Edge<StringType, StringType>(new StringType(id),
          new StringType(e.getKey()), new StringType(e.getValue().toString())));
    }
    return elist.iterator();
  }


  private void loadDictionary(String path) throws IOException {
    FileStatus[] stats = fs.listStatus(new Path(path));
    dictionary = new HashSet<String>();
    for (FileStatus stat : stats) {
      LOG.debug(("Load dictionary: " + stat.getPath().getName()));
      Scanner sc = new Scanner(new BufferedReader(new InputStreamReader(
          fs.open(stat.getPath()))));
      while (sc.hasNextLine()) {
        String line = sc.nextLine();
        dictionary.add(line);
      }
    }
  }


  private String title;
  private String id;
  private HashMap<String, Integer> counts;
  private FileSystem fs;
  private HashSet<String> dictionary;


}
Source Code of com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer

Related Classes of com.intel.hadoop.graphbuilder.demoapps.wikipedia.docwordgraph.WordCountGraphTokenizer