Package it.cnr.isti.hpc.wikipedia.parser

Source Code of it.cnr.isti.hpc.wikipedia.parser.ArticleParser

/**
*  Copyright 2013 Diego Ceccarelli
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package it.cnr.isti.hpc.wikipedia.parser;

import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.Article.Type;
import it.cnr.isti.hpc.wikipedia.article.Language;
import it.cnr.isti.hpc.wikipedia.article.Link;
import it.cnr.isti.hpc.wikipedia.article.Table;
import it.cnr.isti.hpc.wikipedia.article.Template;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ContentElement;
import de.tudarmstadt.ukp.wikipedia.parser.DefinitionList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedListContainer;
import de.tudarmstadt.ukp.wikipedia.parser.Paragraph;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.Span;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;

/**
* Generates a Mediawiki parser given a language, (it will expect to find a
* locale file in <tt>src/main/resources/</tt>).
*
* @see Locale
*
* @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
*
*         Created on Feb 14, 2013
*/
public class ArticleParser {

  static MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();

  private static final Logger logger = LoggerFactory
      .getLogger(ArticleParser.class);

  /** the language (used for the locale) default is English **/
  private String lang = Language.EN;

  static int shortDescriptionLength = 500;
  private List<String> redirects;

  private MediaWikiParser parser;
  private Locale locale;

  public ArticleParser(String lang) {
    this.lang = lang;
    parser = parserFactory.getParser(lang);
    locale = new Locale(lang);
    redirects = locale.getRedirectIdentifiers();

  }

  public ArticleParser() {
    parser = parserFactory.getParser(lang);
    locale = new Locale(lang);
    redirects = locale.getRedirectIdentifiers();

  }

  public void parse(Article article, String mediawiki) {
    ParsedPage page = parser.parse(mediawiki);
    setRedirect(article, mediawiki);

    parse(article, page);

  }

  private void parse(Article article, ParsedPage page) {
    article.setLang(lang);
    setWikiTitle(article);
    if (page == null) {
      logger.warn("page is null for article {}", article.getTitle());
    } else {
      setParagraphs(article, page);
      // setShortDescription(article);
      setTemplates(article, page);
      setLinks(article, page);
      setCategories(article, page);
      setHighlights(article, page);
      setSections(article, page);
      setTables(article, page);
      setEnWikiTitle(article, page);
      setLists(article, page);
    }
    setRedirect(article);
    setDisambiguation(article);
    setIsList(article);
  }

  // /**
  // * @param article
  // */
  // private void setShortDescription(Article article) {
  // StringBuilder sb = new StringBuilder();
  // for (String paragraph : article.getParagraphs()) {
  // paragraph = removeTemplates(paragraph);
  // sb.append(paragraph);
  // if (sb.length() > shortDescriptionLength) {
  // break;
  // }
  // }
  // if (sb.length() > shortDescriptionLength) {
  // sb.setLength(shortDescriptionLength);
  // int pos = sb.lastIndexOf(" ");
  // sb.setLength(pos);
  // }
  // article.setShortDescription(sb.toString());
  //
  // }

  // private final static String templatePattern = "TEMPLATE\\[[^]]+\\]";
  //
  // private static String removeTemplates(String paragraph) {
  // paragraph = paragraph.replaceAll(templatePattern, " ");
  //
  // return paragraph;
  // }

  /**
   * @param article
   */
  private void setWikiTitle(Article article) {
    article.setWikiTitle(Article.getTitleInWikistyle(article.getTitle()));

  }

  /**
   * @param article
   */
  private void setIsList(Article article) {
    for (String list : locale.getListIdentifiers()) {
      if (StringUtils.startsWithIgnoreCase(article.getTitle(), list)) {
        article.setType(Type.LIST);
      }
    }

  }

  private void setRedirect(Article article) {
    if (!article.getRedirect().isEmpty())
      return;
    List<List<String>> lists = article.getLists();
    if ((!lists.isEmpty()) && (! lists.get(0).isEmpty())) {
      // checking only first item in first list
      String line = lists.get(0).get(0);

      for (String redirect : redirects) {
        if (StringUtils.startsWithIgnoreCase(line, redirect)) {
          int pos = line.indexOf(' ');
          if (pos < 0)
            return;
          String red = line.substring(pos).trim();
          red = Article.getTitleInWikistyle(red);
          article.setRedirect(red);
          article.setType(Type.REDIRECT);
          return;

        }
      }
    }
  }

  // for (List<String> lists : article.getLists()) {
  // for (String line : lists) {
  // for (String redirect : redirects) {
  // if (StringUtils.startsWithIgnoreCase(line, redirect)) {
  // int pos = line.indexOf(' ');
  // if (pos < 0)
  // return;
  // String red = line.substring(pos).trim();
  // red = Article.getTitleInWikistyle(red);
  // article.setRedirect(red);
  // article.setType(Type.REDIRECT);
  // return;
  //
  // }
  // }
  // }
  // }

  /**
   * @param article
   * @param page
   */
  private void setRedirect(Article article, String mediawiki) {
    for (String redirect : redirects)
      if (StringUtils.startsWithIgnoreCase(mediawiki, redirect)) {
        int start = mediawiki.indexOf("[[") + 2;
        int end = mediawiki.indexOf("]]");
        if (start < 0 || end < 0) {
          logger.warn("cannot find the redirect {}\n mediawiki: {}",
              article.getTitle(), mediawiki);
          continue;
        }
        String r = Article.getTitleInWikistyle(mediawiki.substring(
            start, end));
        article.setRedirect(r);
        article.setType(Type.REDIRECT);
      }

  }

  /**
   * @param page
   */
  private void setTables(Article article, ParsedPage page) {
    List<Table> tables = new ArrayList<Table>();

    for (de.tudarmstadt.ukp.wikipedia.parser.Table t : page.getTables()) {
      // System.out.println(t);

      int i = 0;
      String title = "";
      if (t.getTitleElement() != null) {
        title = t.getTitleElement().getText();
        if (title == null)
          title = "";
      }
      Table table = new Table(title);
      List<String> currentRow = new ArrayList<String>();
      List<Content> contentList = t.getContentList();
      for (@SuppressWarnings("unused")
      Content c : contentList) {

        int row, col;
        String elem = "";

        try {

          col = t.getTableElement(i).getCol();
          row = t.getTableElement(i).getRow();
          elem = t.getTableElement(i).getText();

        } catch (IndexOutOfBoundsException e) {
          // logger.(
          // "Error creating table {}, Index out of bound - content = {}",
          // table.getName(), c.getText());
          break;

        }
        if (row > 0 && col == 0) {
          if ((currentRow.size() == 1)
              && (currentRow.get(0).equals(table.getName()))) {
            currentRow = new ArrayList<String>();
          } else {
            if (!currentRow.isEmpty())
              table.addRow(currentRow);
            currentRow = new ArrayList<String>();
          }

        }
        currentRow.add(elem);
        i++;
      }
      table.addRow(currentRow);
      tables.add(table);
    }

    article.setTables(tables);

  }

  protected void setEnWikiTitle(Article article, ParsedPage page) {
    if (article.isLang(Language.EN)) {
      return;
    }
    try {
      if (page.getLanguages() == null) {
        article.setEnWikiTitle("");
        return;
      }
    } catch (NullPointerException e) {
      // FIXME title is always null!
      logger.warn("no languages for page {} ", article.getTitle());
      return;
    }
    for (de.tudarmstadt.ukp.wikipedia.parser.Link l : page.getLanguages())
      if (l.getText().startsWith("en:")) {
        article.setEnWikiTitle(l.getTarget().substring(3));
        break;
      }

  }

  /**
   * @param page
   */
  private void setSections(Article article, ParsedPage page) {
    List<String> sections = new ArrayList<String>(10);
    for (Section s : page.getSections()) {

      if (s == null || s.getTitle() == null)
        continue;
      sections.add(s.getTitle());
    }
    article.setSections(sections);

  }

  private void setLinks(Article article, ParsedPage page) {

    List<Link> links = new ArrayList<Link>(10);
    List<Link> elinks = new ArrayList<Link>(10);

    for (de.tudarmstadt.ukp.wikipedia.parser.Link t : page.getLinks()) {
      if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL) {

        links.add(new Link(t.getTarget(), t.getText()));

      }
      if (t.getType() == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {

        elinks.add(new Link(t.getTarget(), t.getText()));

      }
    }
    article.setLinks(links);
    article.setExternalLinks(elinks);
  }

  private void setTemplates(Article article, ParsedPage page) {
    List<Template> templates = new ArrayList<Template>(10);

    for (de.tudarmstadt.ukp.wikipedia.parser.Template t : page
        .getTemplates()) {
      List<String> templateParameters = t.getParameters();
      parseTemplatesSchema(article, templateParameters);

      if (t.getName().toLowerCase().startsWith("infobox")) {
        article.setInfobox(new Template(t.getName(), templateParameters));
      } else {
        templates.add(new Template(t.getName(), templateParameters));
      }
    }
    article.setTemplates(templates);

  }

  /**
   *
   * @param templateParameters
   */
  private void parseTemplatesSchema(Article article,
      List<String> templateParameters) {
    List<String> schema = new ArrayList<String>(10);

    for (String s : templateParameters) {
      try {
        if (s.contains("=")) {
          String attributeName = s.split("=")[0].trim().toLowerCase();
          schema.add(attributeName);
        }

      } catch (Exception e) {
        continue;
      }
    }
    article.addTemplatesSchema(schema);

  }

  private void setCategories(Article article, ParsedPage page) {
    ArrayList<Link> categories = new ArrayList<Link>(10);

    for (de.tudarmstadt.ukp.wikipedia.parser.Link c : page.getCategories()) {

      categories.add(new Link(c.getTarget(), c.getText()));
    }
    article.setCategories(categories);

  }

  private void setHighlights(Article article, ParsedPage page) {
    List<String> highlights = new ArrayList<String>(20);

    for (Paragraph p : page.getParagraphs()) {
      for (Span t : p.getFormatSpans(Content.FormatType.BOLD)) {
        highlights.add(t.getText(p.getText()));
      }
      for (Span t : p.getFormatSpans(Content.FormatType.ITALIC)) {
        highlights.add(t.getText(p.getText()));
      }

    }
    article.setHighlights(highlights);

  }

  private void setParagraphs(Article article, ParsedPage page) {
    List<String> paragraphs = new ArrayList<String>(page.nrOfParagraphs());
    for (Paragraph p : page.getParagraphs()) {
      String text = p.getText();
      // text = removeTemplates(text);
      text = text.replace("\n", " ").trim();
      if (!text.isEmpty())
        paragraphs.add(text);
    }
    article.setParagraphs(paragraphs);
  }

  private void setLists(Article article, ParsedPage page) {
    List<List<String>> lists = new LinkedList<List<String>>();
    for (DefinitionList dl : page.getDefinitionLists()) {
      List<String> l = new ArrayList<String>();
      for (ContentElement c : dl.getDefinitions()) {
        l.add(c.getText());
      }
      lists.add(l);
    }
    for (NestedListContainer dl : page.getNestedLists()) {
      List<String> l = new ArrayList<String>();
      for (NestedList nl : dl.getNestedLists())
        l.add(nl.getText());
      lists.add(l);
    }
    article.setLists(lists);

  }

  private void setDisambiguation(Article a) {

    for (String disambiguation : locale.getDisambigutionIdentifiers()) {
      if (StringUtils.containsIgnoreCase(a.getTitle(), disambiguation)) {
        a.setType(Type.DISAMBIGUATION);
        return;
      }
      for (Template t : a.getTemplates()) {
        if (StringUtils.equalsIgnoreCase(t.getName(), disambiguation)) {
          a.setType(Type.DISAMBIGUATION);
          return;

        }
      }

    }
  }

}
TOP

Related Classes of it.cnr.isti.hpc.wikipedia.parser.ArticleParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.