Package com.intel.hadoop.graphbuilder.demoapps.wikipedia.linkgraph

Source Code of com.intel.hadoop.graphbuilder.demoapps.wikipedia.linkgraph.LinkGraphTokenizer

/* Copyright (C) 2012 Intel Corporation.
*     All rights reserved.
*          
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*   limitations under the License.
*
* For more about this software visit:
*      http://www.01.org/GraphBuilder
*/
package com.intel.hadoop.graphbuilder.demoapps.wikipedia.linkgraph;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.collections.iterators.EmptyIterator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.intel.hadoop.graphbuilder.graph.Edge;
import com.intel.hadoop.graphbuilder.graph.Vertex;
import com.intel.hadoop.graphbuilder.preprocess.inputformat.GraphTokenizer;
import com.intel.hadoop.graphbuilder.types.EmptyType;
import com.intel.hadoop.graphbuilder.types.StringType;

public class LinkGraphTokenizer implements
    GraphTokenizer<StringType, EmptyType, EmptyType> {
  private static final Logger LOG = Logger.getLogger(LinkGraphTokenizer.class);

  public LinkGraphTokenizer() throws ParserConfigurationException {
    factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    builder = factory.newDocumentBuilder();
    XPathFactory xfactory = XPathFactory.newInstance();
    xpath = xfactory.newXPath();

    vlist = new ArrayList<Vertex<StringType, EmptyType>>();
    elist = new ArrayList<Edge<StringType, EmptyType>>();
    links = new ArrayList<String>();
  }

  @Override
  public void configure(JobConf job) {
  }

  @Override
  public Class vidClass() {
    return StringType.class;
  }

  @Override
  public Class vdataClass() {
    return EmptyType.class;
  }

  @Override
  public Class edataClass() {
    return EmptyType.class;
  }

  public void parse(String s) {

    try {
      Document doc = builder.parse(new InputSource(new StringReader(s)));
      title = xpath.evaluate("//page/title/text()", doc);
      title = title.replaceAll("\\s", "_");
      id = xpath.evaluate("//page/id/text()", doc);
      String text = xpath.evaluate("//page/revision/text/text()", doc);
      parseLinks(text);
    } catch (SAXException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (XPathExpressionException e) {
      e.printStackTrace();
    }
  }

  public Iterator<Vertex<StringType, EmptyType>> getVertices() {
    vlist.clear();
    vlist.add(new Vertex<StringType, EmptyType>(new StringType(title),
        EmptyType.INSTANCE));
    for (String link : links)
      vlist.add(new Vertex<StringType, EmptyType>(new StringType(link),
          EmptyType.INSTANCE));
    return vlist.iterator();
  }

  @Override
  public Iterator<Edge<StringType, EmptyType>> getEdges() {
    if (links.isEmpty())
      return EmptyIterator.INSTANCE;

    elist.clear();
    Iterator<String> iter = links.iterator();
    while (iter.hasNext()) {
      elist.add(new Edge<StringType, EmptyType>(new StringType(title),
          new StringType(iter.next()), EmptyType.INSTANCE));
    }
    return elist.iterator();
  }

  /** This function is taken and modified from wikixmlj WikiTextParser */
  private void parseLinks(String text) {
    links.clear();
    Pattern catPattern = Pattern
        .compile("\\[\\[(.*?)\\]\\]", Pattern.MULTILINE);
    Matcher matcher = catPattern.matcher(text);
    while (matcher.find()) {
      String[] temp = matcher.group(1).split("\\|");
      if (temp == null || temp.length == 0)
        continue;
      String link = temp[0];
      if (!link.replaceAll("\\s", "").isEmpty() && !link.contains(":")) {
        links.add(link.replaceAll("\\s", "_"));
      }
    }
  }

  private String id;
  private String title;
  private List<String> links;
  private ArrayList<Vertex<StringType, EmptyType>> vlist;
  private ArrayList<Edge<StringType, EmptyType>> elist;

  private DocumentBuilderFactory factory;
  private DocumentBuilder builder;
  private XPath xpath;

}
TOP

Related Classes of com.intel.hadoop.graphbuilder.demoapps.wikipedia.linkgraph.LinkGraphTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.