// Package org.apache.marmotta.commons.sesame.rio.rss
//
// Source code of org.apache.marmotta.commons.sesame.rio.rss.AtomParser

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.marmotta.commons.sesame.rio.rss;

import com.google.common.base.Preconditions;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.atom.*;
import com.sun.syndication.feed.module.DCModule;
import com.sun.syndication.feed.module.Module;
import com.sun.syndication.feed.module.SyModule;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.WireFeedInput;

import org.apache.marmotta.commons.sesame.rio.rss.AtomFormat;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.rometools.feed.module.content.ContentModule;
import org.rometools.feed.module.georss.GeoRSSModule;
import org.rometools.feed.module.mediarss.MediaEntryModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

/**
* Parse Atom feed into RDF. Uses the following vocabularies:
* <ul>
*     <li>dcterms for representing most metadata about feeds and entries</li>
*     <li>sioc for type information and relation between concepts</li>
*     <li>skos for representing categories associated with items or channels</li>
*     <li>media ontology for representing information from the mediarss extension</li>
*     <li>wgs84 geo ontology for representing information from the georss extension</li>
* </ul>
* <p/>
* Author: Sebastian Schaffert
*/
public class AtomParser extends FeedParserBase {


    private static Logger log = LoggerFactory.getLogger(AtomParser.class);


    /**
     * Creates a new RDFParserBase that will use a {@link org.openrdf.model.impl.ValueFactoryImpl} to
     * create RDF model objects.
     */
    public AtomParser() {
        this(new ValueFactoryImpl());
    }

    /**
     * Creates a new RDFParserBase that will use the supplied ValueFactory to
     * create RDF model objects.
     *
     * @param valueFactory A ValueFactory.
     */
    public AtomParser(ValueFactory valueFactory) {
        super(valueFactory);
        this.valueFactory = valueFactory;
    }



    /**
     * Gets the RDF format that this parser can parse.
     */
    @Override
    public RDFFormat getRDFFormat() {
        return AtomFormat.FORMAT;
    }


    /**
     * Parses the data from the supplied InputStream, using the supplied baseURI
     * to resolve any relative URI references.
     *
     * @param in      The InputStream from which to read the data.
     * @param baseURI The URI associated with the data in the InputStream.
     * @throws java.io.IOException If an I/O error occurred while data was read from the InputStream.
     * @throws org.openrdf.rio.RDFParseException
     *                             If the parser has found an unrecoverable parse error.
     * @throws org.openrdf.rio.RDFHandlerException
     *                             If the configured statement handler has encountered an
     *                             unrecoverable error.
     */
    @Override
    public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
        Preconditions.checkNotNull(baseURI);

        setBaseURI(baseURI);

        WireFeedInput input = new WireFeedInput();
        try {
            WireFeed feed = input.build(new InputSource(in));
            if(feed instanceof Feed) {
                parseFeed((Feed) feed);
            } else {
                throw new RDFParseException("data stream is not an RSS feed");
            }
        } catch (FeedException e) {
            throw new RDFParseException(e);
        }
    }

    /**
     * Parses the data from the supplied Reader, using the supplied baseURI to
     * resolve any relative URI references.
     *
     * @param reader  The Reader from which to read the data.
     * @param baseURI The URI associated with the data in the InputStream.
     * @throws java.io.IOException If an I/O error occurred while data was read from the InputStream.
     * @throws org.openrdf.rio.RDFParseException
     *                             If the parser has found an unrecoverable parse error.
     * @throws org.openrdf.rio.RDFHandlerException
     *                             If the configured statement handler has encountered an
     *                             unrecoverable error.
     */
    @Override
    public void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
        Preconditions.checkNotNull(baseURI);

        setBaseURI(baseURI);

        WireFeedInput input = new WireFeedInput();
        try {
            WireFeed feed = input.build(reader);
            if(feed instanceof Feed) {
                parseFeed((Feed) feed);
            } else {
                throw new RDFParseException("data stream is not an RSS feed");
            }
        } catch (FeedException e) {
            throw new RDFParseException(e);
        }
    }



    private void parseFeedEntry(final Entry entry, final Resource r_feed) throws RDFParseException, RDFHandlerException {

        final String entryURI = entry.getId();

        URI r_entry = createURI(entryURI);
        URI rdf_type = createURI(NS_RDF + "type");


        // add type sioc:Post
        rdfHandler.handleStatement(createStatement(r_entry, rdf_type, createURI(NS_SIOC + "Post")));

        // add as sioc:container_of from parent feed
        rdfHandler.handleStatement(createStatement(r_feed, createURI(NS_SIOC + "container_of"), r_entry));
        rdfHandler.handleStatement(createStatement(r_entry, createURI(NS_SIOC + "has_container"), r_feed));

        // for each link we create a sioc:link
        for(Link link : entry.getAlternateLinks()) {
            createUrlProperty(r_entry,NS_SIOC + "link", resolveURI(link.getHref()));
        }

        // add all authors as dc:creator
        for(Person person : entry.getAuthors()) {
            parsePerson(r_entry, person, "creator");
        }

        for(Object category : entry.getCategories()) {
            parseCategory(r_entry, (Category)category);
        }

        for(Content content : entry.getContents()) {
            createStringProperty(r_entry, NS_RSS_CONTENT + "encoded", content.getValue());
            createStringProperty(r_entry, NS_RSS_CONTENT + "format", content.getType());
            createStringProperty(r_entry, NS_DC_TERMS + "description",content.getValue());
        }

        // add all authors as dc:creator
        for(Person person : entry.getContributors()) {
            parsePerson(r_entry, person, "contributor");
        }

        createDateProperty(r_entry, NS_DC_TERMS + "created", entry.getCreated());

        // ignore foreign markup

        createDateProperty(r_entry, NS_DC_TERMS + "issued", entry.getPublished());
        createDateProperty(r_entry, NS_DC_TERMS + "modified", entry.getUpdated());


        // GUID is sometimes a URL but the documentation says this cannot be guaranteed, so we use dc:identifier
        createStringProperty(r_entry, NS_DC_TERMS + "identifier", entry.getId());


        for(Object module : entry.getModules()) {
            if(module instanceof DCModule) {
                parseDCModule(r_entry, (DCModule)module);
            } else if(module instanceof GeoRSSModule) {
                parseGeoModule(r_entry, (GeoRSSModule)module);
            } else if(module instanceof MediaEntryModule) {
                parseMediaModule(r_entry, (MediaEntryModule)module);
            } else if(module instanceof ContentModule) {
                parseContentModule(r_entry, (ContentModule)module);
            } else {
                log.warn("module {} not supported yet", module.toString());
            }

            // TODO: add support for more modules!
        }

        // for each link we create a sioc:link
        for(Link link : entry.getOtherLinks()) {
            createUrlProperty(r_entry,NS_SIOC + "link", resolveURI(link.getHref()));
        }

        // copyright information
        createStringProperty(r_entry, NS_DC_TERMS + "rights", entry.getRights());

        // if the source is present, we link just to the URL using dc:source and ignore the text
        if(entry.getSource() != null) {
            createUrlProperty(r_entry, NS_DC_TERMS + "source", entry.getSource().getId());
        }

        if(entry.getSummary() != null) {
            createStringProperty(r_entry, NS_DC_TERMS + "abstract",entry.getSummary().getValue());
        }

        // title is dc:title
        createStringProperty(r_entry, NS_DC_TERMS + "title", entry.getTitle());

        log.debug("parsed Atom item {}", r_entry.stringValue());
    }

    /**
     * Import data from an RSS or atom feed using the ROME SyndFeed representation.
     *
     * @param feed the ROME rss/atom feed representation
     * @return count of imported documents
     */
    private void parseFeed(final Feed feed) throws RDFParseException, RDFHandlerException {
        if (log.isInfoEnabled()) {
            log.info("importing entries from {} feed '{}' found at '{}'",new Object[] {feed.getFeedType(),feed.getTitle(),feed.getId()});
        }

        final String feedUri = feed.getId();
        if (feedUri == null) {
            log.error("feed '{}' has neither uri nor link to reference", feed.getTitle());
            return;
        }

        // we set some namespaces first
        setNamespace(NS_DC_TERMS,"dcterms");
        setNamespace(NS_RSS_SY,"sy");
        setNamespace(NS_RSS_CONTENT,"content");
        setNamespace(NS_SIOC,"sioc");

        URI r_feed = createURI(feedUri);
        URI rdf_type = createURI(NS_RDF + "type");

        // add type sioc:Forum
        rdfHandler.handleStatement(createStatement(r_feed, rdf_type, createURI(NS_SIOC + "Forum")));
        createUrlProperty(r_feed, NS_SIOC + "feed", feedUri);

        // for each link we create a sioc:link
        for(Link link : feed.getAlternateLinks()) {
            createUrlProperty(r_feed,NS_SIOC + "link", resolveURI(link.getHref()));
        }

        // add all authors as dc:creator
        for(Person person : feed.getAuthors()) {
            parsePerson(r_feed, person, "creator");
        }


        // add all categories that are present
        for(Object category : feed.getCategories()) {
            parseCategory(r_feed, (Category) category);
        }

        // add all contributors as dc:contributor
        for(Person person : feed.getAuthors()) {
            parsePerson(r_feed, person, "contributor");
        }


        // add dc:creator to point to the software used for generating feed
        createStringProperty(r_feed, NS_DC_TERMS + "provenance", feed.getGenerator().getValue());

        // add foaf:depiction in case there is an image
        if(feed.getIcon() != null) {
            createUrlProperty(r_feed, NS_FOAF + "thumbnail", resolveURI(feed.getIcon()));
        }

        // add all feed items
        for(Entry item : feed.getEntries()) {
            parseFeedEntry(item, r_feed);
        }

        // add dc:language for feed.getLanguage()
        createStringProperty(r_feed, NS_DC_TERMS + "language", feed.getLanguage());

        // add foaf:depiction in case there is an image
        if(feed.getLogo() != null) {
            createUrlProperty(r_feed, NS_FOAF + "logo", resolveURI(feed.getLogo()));
        }



        for(Module module : feed.getModules()) {
            if(module instanceof SyModule) {
                SyModule syModule = (SyModule)module;
                createStringProperty(r_feed,NS_RSS_SY + "updatePeriod", syModule.getUpdatePeriod());
                createIntProperty(r_feed, NS_RSS_SY + "updateFrequency", syModule.getUpdateFrequency());
                createDateProperty(r_feed, NS_RSS_SY + "updateBase", syModule.getUpdateBase());
            } else if(module instanceof DCModule) {
                parseDCModule(r_feed, (DCModule)module);
            }
        }

        // for each link we create a sioc:link
        for(Link link : feed.getOtherLinks()) {
            createUrlProperty(r_feed,NS_SIOC + "link", resolveURI(link.getHref()));
        }
        // add dc:rights for feed.getCopyright()
        createStringProperty(r_feed, NS_DC_TERMS + "rights", feed.getRights());

        // add dc:description for feed.getDescription()
        if(feed.getSubtitle() != null) {
            createStringProperty(r_feed, NS_DC_TERMS + "description", feed.getSubtitle().getValue());
        }


        // textinput: we skip it, the documentation says:
        // "The purpose of the <textInput> element is something of a mystery. You can use it to specify a
        // search engine box. Or to allow a reader to provide feedback. Most aggregators ignore it. "

        createStringProperty(r_feed, NS_DC_TERMS + "title", feed.getTitle());

        // add dc:created and dc:issued for update date
        createDateProperty(r_feed, NS_DC_TERMS + "created", feed.getUpdated());
        createDateProperty(r_feed, NS_DC_TERMS + "issued", feed.getUpdated());

        log.info("importing Atom feed finished successfully.");
    }

    protected void parseCategory(Resource resource, Category category) throws RDFHandlerException, RDFParseException {
        if(category.getTerm() == null) {
            return;
        }

        try {
            Resource skosConcept;
            if(category.getScheme() != null ) {
                // create a skos:Concept with the domain as namespace and a local name derived from the value, add it as sioc:topic
                String localName = URLEncoder.encode(category.getTerm(), "UTF-8");
                String namespace = category.getScheme();
                skosConcept = createURI(namespace+(namespace.endsWith("/") || namespace.endsWith("#")?"":"/")+localName);
            } else  {
                // create a skos:Concept with the baseUri as namespace and a local name derived from the value, add it as sioc:topic
                String localName = URLEncoder.encode(category.getTerm(), "UTF-8");
                skosConcept = resolveURI(localName);
            }
            createUrlProperty(skosConcept,NS_RDF + "type", NS_SKOS+"Concept");
            if(category.getLabel() != null) {
                createStringProperty(skosConcept, NS_SKOS + "prefLabel", category.getLabel());
            } else {
                createStringProperty(skosConcept, NS_SKOS + "prefLabel", category.getTerm());
            }
            rdfHandler.handleStatement(createStatement(resource,createURI(NS_SIOC + "topic"),skosConcept));
        } catch (UnsupportedEncodingException e) {
            throw new RDFParseException(e);
        }


        // add category value as dc:subject
        if(category.getLabel() != null) {
            createStringProperty(resource, NS_DC_TERMS + "subject", category.getLabel());
        } else {
            createStringProperty(resource, NS_DC_TERMS + "subject", category.getTerm());
        }

    }


    protected void parsePerson(Resource r_entry, Person person, String relation) throws RDFParseException, RDFHandlerException {
        if("creator".equals(relation) && (person.getUri() != null || person.getEmail() != null)) {
            String personUri = person.getUri() != null ? person.getUri() : "mailto:"+person.getEmail();
            Resource r_person = createURI(personUri);
            createStringProperty(r_person, NS_FOAF + "name", person.getName());
            if(person.getEmail() != null) {
                createUrlProperty(r_person, NS_FOAF + "mbox", "mailto:"+person.getEmail());
            }
            createUrlProperty(r_person, NS_FOAF + "homepage", person.getUri());

            rdfHandler.handleStatement(createStatement(r_entry, createURI(NS_FOAF + "maker"), r_person));
            rdfHandler.handleStatement(createStatement(r_person, createURI(NS_FOAF + "made"), r_entry));
        }
        createStringProperty(r_entry,NS_DC_TERMS + relation,person.getName());
    }

}
// TOP
//
// Related classes of org.apache.marmotta.commons.sesame.rio.rss.AtomParser
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.