Package org.apache.nutch.parse.feed

Source Code of org.apache.nutch.parse.feed.FeedParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.feed;

// JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

// APACHE imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.xml.sax.InputSource;

// ROME imports
import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndPerson;
import com.sun.syndication.io.SyndFeedInput;

/**
*
* @author dogacan
* @author mattmann
* @since NUTCH-444
*
* <p>
* A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links
* and content present in the feed.
* </p>
*
*/
public class FeedParser implements Parser {

  public static final String CHARSET_UTF8 = "charset=UTF-8";

  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
      + CHARSET_UTF8;

  public static final Logger LOG = LoggerFactory
      .getLogger("org.apache.nutch.parse.feed");

  private Configuration conf;

  private ParserFactory parserFactory;

  private URLNormalizers normalizers;

  private URLFilters filters;

  private String defaultEncoding;

  /**
   * Parses the given feed and extracts out and parsers all linked items within
   * the feed, using the underlying ROME feed parsing library.
   *
   * @param content
   *          A {@link Content} object representing the feed that is being
   *          parsed by this {@link Parser}.
   *
   * @return A {@link ParseResult} containing all {@link Parse}d feeds that
   *         were present in the feed file that this {@link Parser} dealt with.
   *
   */
  public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());

    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
      InputSource input = new InputSource(new ByteArrayInputStream(content
          .getContent()));
      input.setEncoding(encoding);
      SyndFeedInput feedInput = new SyndFeedInput();
      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }

    for (Iterator i = entries.iterator(); i.hasNext();) {
      SyndEntry entry = (SyndEntry) i.next();
      addToMap(parseResult, feed, feedLink, entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }

  /**
   *
   * Sets the {@link Configuration} object for this {@link Parser}. This
   * {@link Parser} expects the following configuration properties to be set:
   *
   * <ul>
   * <li>URLNormalizers - properties in the configuration object to set up the
   * default url normalizers.</li>
   * <li>URLFilters - properties in the configuration object to set up the
   * default url filters.</li>
   * </ul>
   *
   * @param conf
   *          The Hadoop {@link Configuration} object to use to configure this
   *          {@link Parser}.
   *
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.parserFactory = new ParserFactory(conf);
    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    this.filters = new URLFilters(conf);
    this.defaultEncoding =
      conf.get("parser.character.encoding.default", "windows-1252");
  }

  /**
   *
   * @return The {@link Configuration} object used to configure this
   *         {@link Parser}.
   */
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Runs a command line version of this {@link Parser}.
   *
   * @param args
   *          A single argument (expected at arg[0]) representing a path on the
   *          local filesystem that points to a feed file.
   *
   * @throws Exception
   *           If any error occurs.
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.err.println("Usage: FeedParser <feed>");
      System.exit(1);
    }
    String name = args[0];
    String url = "file:" + name;
    Configuration conf = NutchConfiguration.create();
    FeedParser parser = new FeedParser();
    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());
      System.out.println("text: " + parse.getText() + "\n");
    }
  }

  private void addToMap(ParseResult parseResult, SyndFeed feed,
      String feedLink, SyndEntry entry, Content content) {
    String link = entry.getLink(), text = null, title = null;
    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
    Parse parse = null;
    SyndContent description = entry.getDescription();

    try {
      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);

      if (link != null)
        link = filters.filter(link);
    } catch (Exception e) {
      e.printStackTrace();
      return;
    }

    if (link == null)
      return;

    title = stripTags(entry.getTitleEx());

    if (feedLink != null)
      parseMeta.set("feed", feedLink);

    addFields(parseMeta, contentMeta, feed, entry);

    // some item descriptions contain markup text in them,
    // so we temporarily set their content-type to parse them
    // with another plugin
    String contentType = contentMeta.get(Response.CONTENT_TYPE);

    if (description != null)
      text = description.getValue();

    if (text == null) {
      List contents = entry.getContents();
      StringBuilder buf = new StringBuilder();
      for (Iterator i = contents.iterator(); i.hasNext();) {
        SyndContent syndContent = (SyndContent) i.next();
        buf.append(syndContent.getValue());
      }
      text = buf.toString();
    }

    try {
      Parser parser = parserFactory.getParsers(contentType, link)[0];
      parse = parser.getParse(
          new Content(link, link, text.getBytes(), contentType, contentMeta,
              conf)).get(link);
    } catch (ParserNotFound e) { /* ignore */
    }

    if (parse != null) {
      ParseData data = parse.getData();
      data.getContentMeta().remove(Response.CONTENT_TYPE);
      mergeMetadata(data.getParseMeta(), parseMeta);
      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
              .getContentMeta(), data.getParseMeta()));
    } else {
      contentMeta.remove(Response.CONTENT_TYPE);
      parseResult.put(link, new ParseText(text), new ParseData(
          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
          parseMeta));
    }

  }

  private static String stripTags(SyndContent c) {
    if (c == null)
      return "";

    String value = c.getValue();

    String[] parts = value.split("<[^>]*>");
    StringBuffer buf = new StringBuffer();

    for (String part : parts)
      buf.append(part);

    return buf.toString().trim();
  }

  private void addFields(Metadata parseMeta, Metadata contentMeta,
      SyndFeed feed, SyndEntry entry) {
    List authors = entry.getAuthors(), categories = entry.getCategories();
    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
    String contentType = null;

    if (authors != null) {
      for (Object o : authors) {
        SyndPerson author = (SyndPerson) o;
        String authorName = author.getName();
        if (checkString(authorName)) {
          parseMeta.add(Feed.FEED_AUTHOR, authorName);
        }
      }
    } else {
      // getAuthors may return null if feed is non-atom
      // if so, call getAuthor to get Dublin Core module creator.
      String authorName = entry.getAuthor();
      if (checkString(authorName)) {
        parseMeta.set(Feed.FEED_AUTHOR, authorName);
      }
    }

    for (Iterator i = categories.iterator(); i.hasNext();) {
      parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i.next()).getName());
    }

    if (published != null) {
      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
    }
    if (updated != null) {
      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
    }

    SyndContent description = entry.getDescription();
    if (description != null) {
      contentType = description.getType();
    } else {
      // TODO: What to do if contents.size() > 1?
      List contents = entry.getContents();
      if (contents.size() > 0) {
        contentType = ((SyndContent) contents.get(0)).getType();
      }
    }

    if (checkString(contentType)) {
      // ROME may return content-type as html
      if (contentType.equals("html"))
        contentType = "text/html";
      else if (contentType.equals("xhtml"))
        contentType = "text/xhtml";
      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
    } else {
      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
    }

  }

  private void mergeMetadata(Metadata first, Metadata second) {
    for (String name : second.names()) {
      String[] values = second.getValues(name);
      for (String value : values) {
        first.add(name, value);
      }
    }
  }

  private boolean checkString(String s) {
    return s != null && !s.equals("");
  }

}
TOP

Related Classes of org.apache.nutch.parse.feed.FeedParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.