Package org.apache.any23.extractor.html

Source Code of org.apache.any23.extractor.html.HReviewExtractor

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.any23.extractor.html;

import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.vocab.DCTerms;
import org.apache.any23.vocab.Review;
import org.apache.any23.vocab.VCard;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.model.vocabulary.RDF;
import org.w3c.dom.Node;

import java.util.List;

import static org.apache.any23.extractor.html.HTMLDocument.TextField;

/**
* Extractor for the <a href="http://microformats.org/wiki/hreview">hReview</a>
* microformat.
*
* @author Gabriele Renzi
*/
public class HReviewExtractor extends EntityBasedMicroformatExtractor {

    private static final Review  vREVIEW  = Review.getInstance();
    private static final VCard   vVCARD   = VCard.getInstance();
    private static final DCTerms vDCTERMS = DCTerms.getInstance();

    @Override
    public ExtractorDescription getDescription() {
        return HReviewExtractorFactory.getDescriptionInstance();
    }

    @Override
    protected String getBaseClassName() {
        return "hreview";
    }

    @Override
    protected void resetExtractor() {
        // Empty.
    }

    @Override
    protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
        BNode rev = getBlankNodeFor(node);
        out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
        final HTMLDocument fragment = new HTMLDocument(node);
        addRating(fragment, rev);
        addSummary(fragment, rev);
        addTime(fragment, rev);
        addType(fragment, rev);
        addDescription(fragment, rev);
        addItem(fragment, rev);
        addReviewer(fragment, rev);

        final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
        tser.addResourceRoot(
                DomUtils.getXPathListForNode(node),
                rev,
                this.getClass()
        );

        return true;
    }

    private void addType(HTMLDocument doc, Resource rev) {
        TextField value = doc.getSingularTextField("type");
        conditionallyAddStringProperty(
                value.source(),
                rev, vREVIEW.type, value.value()
        );
    }

    private void addReviewer(HTMLDocument doc, Resource rev) {
        List<Node> nodes = doc.findAllByClassName("reviewer");
        if (nodes.size() > 0) {
            Node node0 = nodes.get(0);
            addBNodeProperty(
                    node0,
                    rev, vREVIEW.reviewer, getBlankNodeFor(node0)
            );
        }
    }

    private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
        List<Node> nodes = root.findAllByClassName("item");
        for (Node node : nodes) {
            Resource item = findDummy(new HTMLDocument(node));
            addBNodeProperty(
                    node,
                    item, vREVIEW.hasReview, rev
            );
        }
    }

    private Resource findDummy(HTMLDocument item) throws ExtractionException {
        Resource blank = getBlankNodeFor(item.getDocument());
        TextField val = item.getSingularTextField("fn");
        conditionallyAddStringProperty(
                val.source(),
                blank, vVCARD.fn, val.value()
        );
        final TextField url = item.getSingularUrlField("url");
        conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveURI(url.value()));
        TextField pics[] = item.getPluralUrlField("photo");
        for (TextField pic : pics) {
            addURIProperty(blank, vVCARD.photo, getHTMLDocument().resolveURI(pic.value()));
        }
        return blank;
    }

    private void addRating(HTMLDocument doc, Resource rev) {
        HTMLDocument.TextField value = doc.getSingularTextField("rating");
        conditionallyAddStringProperty(
                value.source(), rev, vREVIEW.rating, value.value()
        );
    }

    private void addSummary(HTMLDocument doc, Resource rev) {
        TextField value = doc.getSingularTextField("summary");
        conditionallyAddStringProperty(
                value.source(),
                rev, vREVIEW.title, value.value()
        );
    }

    private void addTime(HTMLDocument doc, Resource rev) {
        TextField value = doc.getSingularTextField("dtreviewed");
        conditionallyAddStringProperty(
                value.source(),
                rev, vDCTERMS.date, value.value()
        );
    }

    private void addDescription(HTMLDocument doc, Resource rev) {
        TextField value = doc.getSingularTextField("description");
        conditionallyAddStringProperty(
                value.source(),
                rev, vREVIEW.text, value.value()
        );
    }

}
TOP

Related Classes of org.apache.any23.extractor.html.HReviewExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.