/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.microdata;
import org.apache.any23.extractor.html.DomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;
import java.io.PrintStream;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* This class provides utility methods for handling <b>Microdata</b>
* nodes contained within a <i>DOM</i> document.
*
* @author Michele Mostarda (mostarda@fbk.eu)
*/
public class MicrodataParser {
/**
 * Policy applied by the parser when a {@link MicrodataParserException}
 * is raised while extracting Microdata.
 */
enum ErrorMode {
/** This mode raises an exception at the first encountered error. */
StopAtFirstError,
/** This mode collects the encountered errors, producing a full error report. */
FullReport
}
/** Name of the attribute marking a node as an <i>itemscope</i>. */
public static final String ITEMSCOPE_ATTRIBUTE = "itemscope";
/** Name of the attribute marking a node as an <i>itemprop</i>. */
public static final String ITEMPROP_ATTRIBUTE = "itemprop";
/**
* Set of (lower case) tag names whose property value is read from the <code>src</code> attribute.
*/
public static final Set<String> SRC_TAGS = Collections.unmodifiableSet(
new HashSet<String>( Arrays.asList("audio", "embed", "iframe", "img", "source", "track", "video") )
);
/**
* Set of (lower case) tag names whose property value is read from the <code>href</code> attribute.
*/
public static final Set<String> HREF_TAGS = Collections.unmodifiableSet(
new HashSet<String>( Arrays.asList("a", "area", "link") )
);
/**
* Document in which <code>itemref</code> identifiers are resolved.
*/
private final Document document;
/**
* This set holds the identifiers of the properties being dereferenced.
* {@link #deferProperties(String...)} first checks whether the
* required dereference has already been asked; if so, it raises
* a loop detection error. This set works in coordination
* with {@link #dereferenceRecursionCounter}, so that when the outermost
* {@link #deferProperties(String...)} call ends the
* {@link #loopDetectorSet} is cleaned up.
*/
private final Set<String> loopDetectorSet = new HashSet<String>();
/**
* {@link ItemScope} cache, keyed by the node describing the item scope.
*/
private final Map<Node,ItemScope> itemScopes = new HashMap<Node,ItemScope>();
/**
* {@link ItemPropValue} cache, keyed by the <i>itemprop</i> node.
* {@link #getPropertyValue(Node)} stores here only the values read from the node text content.
*/
private final Map<Node, ItemPropValue> itemPropValues = new HashMap<Node, ItemPropValue>();
/**
* Counts the nested calls of {@link #deferProperties(String...)}.
* It helps to clean up the {@link #loopDetectorSet} when the recursion ends.
*/
private int dereferenceRecursionCounter = 0;
/**
* Current error mode. Defaults to {@link ErrorMode#FullReport}.
*/
private ErrorMode errorMode = ErrorMode.FullReport;
/**
* List of collected errors. Used when {@link #errorMode} <code>==</code> {@link ErrorMode#FullReport}.
*/
private List<MicrodataParserException> errors = new ArrayList<MicrodataParserException>();
/**
 * Finds all the nodes declaring the <i>itemscope</i> attribute within the given root node.
 *
 * @param node root node to search in.
 * @return list of the detected <i>itemscope</i> nodes.
 */
public static List<Node> getItemScopeNodes(Node node) {
    final List<Node> itemScopeNodes = DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE);
    return itemScopeNodes;
}
/**
 * Tells whether the given node declares the <i>itemscope</i> attribute.
 *
 * @param node node to check.
 * @return <code>true</code> if the node is an <i>itemscope</i>, <code>false</code> otherwise.
 */
public static boolean isItemScope(Node node) {
    // A null default lets us distinguish "attribute missing" from any declared value.
    final String itemScopeValue = DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null);
    return itemScopeValue != null;
}
/**
 * Finds all the nodes declaring the <i>itemprop</i> attribute within the given root node.
 *
 * @param node root node to search in.
 * @return list of the detected <i>itemprop</i> nodes.
 */
public static List<Node> getItemPropNodes(Node node) {
    final List<Node> itemPropNodes = DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE);
    return itemPropNodes;
}
/**
 * Tells whether the given node declares the <i>itemprop</i> attribute.
 *
 * @param node node to check.
 * @return <code>true</code> if the node is an <i>itemprop</i>, <code>false</code> otherwise.
 */
public static boolean isItemProp(Node node) {
    // A null default lets us distinguish "attribute missing" from any declared value.
    final String itemPropValue = DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null);
    return itemPropValue != null;
}
/**
 * Returns only the <i>itemscope</i>s that are top level items, i.e. the
 * <i>itemscope</i> nodes that are not <i>itemprop</i>s themselves and that
 * pass the {@link #getUnnestedNodes(List)} selection.
 *
 * @param node root node to search in.
 * @return list of detected top level item scopes.
 */
public static List<Node> getTopLevelItemScopeNodes(Node node) {
    final List<Node> candidates = new ArrayList<Node>();
    for (Node scopeNode : getItemScopeNodes(node)) {
        if (isItemProp(scopeNode)) {
            // An item scope that is also a property is the value of another item.
            continue;
        }
        candidates.add(scopeNode);
    }
    return getUnnestedNodes(candidates);
}
/**
 * Extracts all the top level <b>Microdata items</b> found within the given <code>document</code>.
 *
 * @param document document to be processed.
 * @param errorMode error management policy.
 * @return report holding the detected <b>itemscope</b> items and the collected errors.
 * @throws MicrodataParserException if
 *         <code>errorMode == {@link org.apache.any23.extractor.microdata.MicrodataParser.ErrorMode#StopAtFirstError}</code>
 *         and an error occurs.
 */
public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode)
        throws MicrodataParserException {
    final List<Node> topLevelNodes = getTopLevelItemScopeNodes(document);
    final MicrodataParser parser = new MicrodataParser(document);
    parser.setErrorMode(errorMode);
    // One item scope per top level node, in detection order.
    final ItemScope[] detected = new ItemScope[topLevelNodes.size()];
    int index = 0;
    for (Node topLevelNode : topLevelNodes) {
        detected[index++] = parser.getItemScope(topLevelNode);
    }
    return new MicrodataParserReport(detected, parser.getErrors());
}
/**
 * Extracts all the top level <b>Microdata items</b> found within the given <code>document</code>,
 * working in {@link ErrorMode#FullReport} mode.
 *
 * @param document document to be processed.
 * @return report holding the detected <b>itemscope</b> items and the collected errors.
 * @throws IllegalStateException if the extraction unexpectedly raises
 *         a {@link MicrodataParserException}.
 */
public static MicrodataParserReport getMicrodata(Document document) {
    final MicrodataParserReport report;
    try {
        report = getMicrodata(document, ErrorMode.FullReport);
    } catch (MicrodataParserException mpe) {
        // In full report mode errors are collected, not thrown.
        throw new IllegalStateException("Unexpected exception.", mpe);
    }
    return report;
}
/**
 * Writes a <i>JSON</i> object containing the list of all extracted Microdata,
 * as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
 * The object has a <code>result</code> array with the detected item scopes and,
 * only if at least one error has been collected, an <code>errors</code> array.
 *
 * @param document document to be processed.
 * @param ps stream on which the <i>JSON</i> is written.
 */
public static void getMicrodataAsJSON(Document document, PrintStream ps) {
    final MicrodataParserReport report = getMicrodata(document);
    final ItemScope[] itemScopes = report.getDetectedItemScopes();
    final MicrodataParserException[] errors = report.getErrors();
    ps.append("{ ");
    // Results.
    ps.append("\"result\" : [");
    for (int i = 0; i < itemScopes.length; i++) {
        ps.print( itemScopes[i].toJSON() );
        if (i < itemScopes.length - 1) {
            ps.print(", ");
        }
    }
    ps.append("] ");
    // Errors.
    if (errors != null && errors.length > 0) {
        ps.append(", ");
        ps.append("\"errors\" : [");
        for (int i = 0; i < errors.length; i++) {
            ps.print( errors[i].toJSON() );
            // The separator must depend on the number of errors, not on the number of item scopes.
            if (i < errors.length - 1) {
                ps.print(", ");
            }
        }
        ps.append("] ");
    }
    ps.append("}");
}
/**
 * Filters the given candidates, keeping only the nodes for which no other
 * candidate is reported as ancestor by {@link DomUtils#isAncestorOf}.
 *
 * @param candidates list of candidate nodes.
 * @return list of unnested nodes, in the same relative order as in <code>candidates</code>.
 */
private static List<Node> getUnnestedNodes(List<Node> candidates) {
    final List<Node> unnestedNodes = new ArrayList<Node>();
    final int count = candidates.size();
    nextCandidate:
    for (int current = 0; current < count; current++) {
        final Node candidate = candidates.get(current);
        for (int other = 0; other < count; other++) {
            if (other != current && DomUtils.isAncestorOf(candidates.get(other), candidate, true)) {
                // The candidate is nested within another candidate: discard it.
                continue nextCandidate;
            }
        }
        unnestedNodes.add(candidate);
    }
    return unnestedNodes;
}
/**
 * Creates a parser working on the given document.
 *
 * @param document document used to resolve the referenced elements, cannot be <code>null</code>.
 * @throws NullPointerException if <code>document</code> is <code>null</code>.
 */
public MicrodataParser(Document document) {
    if (null == document) {
        throw new NullPointerException("Document cannot be null.");
    }
    this.document = document;
}
/**
 * Sets the error management policy.
 *
 * @param errorMode the new error mode, cannot be <code>null</code>.
 * @throws IllegalArgumentException if <code>errorMode</code> is <code>null</code>.
 */
public void setErrorMode(ErrorMode errorMode) {
    if (null == errorMode) {
        throw new IllegalArgumentException("errorMode must be not null.");
    }
    this.errorMode = errorMode;
}
/**
 * @return the current error management policy.
 */
public ErrorMode getErrorMode() {
    final ErrorMode currentMode = this.errorMode;
    return currentMode;
}
/**
 * Returns a snapshot of the errors collected so far.
 *
 * @return array of collected errors, empty if there are none.
 */
public MicrodataParserException[] getErrors() {
    if (errors == null) {
        return new MicrodataParserException[0];
    }
    final MicrodataParserException[] snapshot = new MicrodataParserException[errors.size()];
    return errors.toArray(snapshot);
}
/**
 * Reads the value of an <b>itemprop</b> node. The value depends on the node name:
 * <ul>
 * <li><code>meta</code>: the <code>content</code> attribute, as plain value;</li>
 * <li>any of {@link #SRC_TAGS}: the <code>src</code> attribute, as link;</li>
 * <li>any of {@link #HREF_TAGS}: the <code>href</code> attribute, as link;</li>
 * <li><code>object</code>: the <code>data</code> attribute, as link;</li>
 * <li><code>time</code>: the parsed <code>datetime</code> attribute, as date;</li>
 * <li>any other node declaring <b>itemscope</b>: the nested item scope;</li>
 * <li>otherwise: the node text content, as plain value.</li>
 * </ul>
 * Only the values read from the text content are stored in the internal cache.
 *
 * @param node itemprop node.
 * @return value detected within the given <code>node</code>.
 * @throws MicrodataParserException if the <code>datetime</code> attribute cannot be parsed
 *         or if an error occurs while extracting a nested item scope.
 */
public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException {
    final ItemPropValue itemPropValue = itemPropValues.get(node);
    if (itemPropValue != null) {
        return itemPropValue;
    }

    // Tag names are matched case-insensitively. Locale.ROOT keeps the lowering independent
    // of the default locale (e.g. with a Turkish locale "IMG" would not become "img").
    final String nodeName = node.getNodeName().toLowerCase(java.util.Locale.ROOT);
    if ("meta".equals(nodeName)) {
        return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain);
    }
    if (SRC_TAGS.contains(nodeName)) {
        return new ItemPropValue(DomUtils.readAttribute(node, "src"), ItemPropValue.Type.Link);
    }
    if (HREF_TAGS.contains(nodeName)) {
        return new ItemPropValue(DomUtils.readAttribute(node, "href"), ItemPropValue.Type.Link);
    }
    if ("object".equals(nodeName)) {
        return new ItemPropValue(DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link);
    }
    if ("time".equals(nodeName)) {
        final String dateTimeStr = DomUtils.readAttribute(node, "datetime");
        final Date dateTime;
        try {
            dateTime = ItemPropValue.parseDateTime(dateTimeStr);
        } catch (ParseException pe) {
            throw new MicrodataParserException(
                    String.format("Invalid format for datetime '%s'", dateTimeStr),
                    node
            );
        }
        return new ItemPropValue(dateTime, ItemPropValue.Type.Date);
    }
    if (isItemScope(node)) {
        // Nested item scopes are cached by getItemScope(Node).
        return new ItemPropValue(getItemScope(node), ItemPropValue.Type.Nested);
    }

    final ItemPropValue newItemPropValue = new ItemPropValue(node.getTextContent(), ItemPropValue.Type.Plain);
    itemPropValues.put(node, newItemPropValue);
    return newItemPropValue;
}
/**
 * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node.
 * The descendants of <code>scopeNode</code> are walked without entering the elements
 * that declare a new <b>itemscope</b>; such elements are still collected when they
 * declare the <b>itemprop</b> attribute. An <b>itemprop</b> attribute listing several
 * whitespace separated names produces one {@link ItemProp} per name.
 *
 * @param scopeNode node representing the <b>itemscope</b>.
 * @param skipRoot if <code>true</code> the given root <code>node</code>
 *        will be not read as a property, even if it contains the <b>itemprop</b> attribute.
 * @return the list of <b>itemprop</b>s detected within the given <b>itemscope</b>.
 * @throws MicrodataParserException if an error occurs while retrieving a property value
 *         and the error mode is {@link ErrorMode#StopAtFirstError}.
 */
public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException {
    // Insertion ordered set: the walker may visit a node more than once.
    final Set<Node> accepted = new LinkedHashSet<Node>();

    if (!skipRoot) {
        // getAttributes() is null for non-element nodes.
        final NamedNodeMap attributes = scopeNode.getAttributes();
        if (attributes != null && attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null) {
            accepted.add(scopeNode);
        }
    }

    // TreeWalker to walk DOM tree starting with the scopeNode. Nodes maybe visited multiple times.
    final TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument())
            .createTreeWalker(scopeNode, NodeFilter.SHOW_ELEMENT, new NodeFilter() {
                @Override
                public short acceptNode(Node node) {
                    if (node.getNodeType() == Node.ELEMENT_NODE) {
                        final NamedNodeMap attributes = node.getAttributes();
                        if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null && !scopeNode.equals(node)) {
                            accepted.add(node);
                        }
                        if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
                            // Don't visit descendants of nodes that define a new scope
                            return FILTER_REJECT;
                        }
                    }
                    return FILTER_ACCEPT;
                }
            }, false);

    // To populate accepted we only need to walk the tree.
    while (treeWalker.nextNode() != null) {
        // Nothing to do: the filter collects the nodes.
    }

    final List<ItemProp> result = new ArrayList<ItemProp>();
    for (Node itemPropNode : accepted) {
        final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
        if (itemProp == null) {
            // Defensive: every accepted node declares the attribute, so a value is expected here.
            continue;
        }
        // itemprop is a set of whitespace separated tokens: split on any run of
        // whitespace, so that repeated spaces, tabs or newlines do not produce empty names.
        final String[] propertyNames = itemProp.trim().split("\\s+");
        for (String propertyName : propertyNames) {
            if (propertyName.isEmpty()) {
                // Empty or blank attribute: no property name declared.
                continue;
            }
            final ItemPropValue itemPropValue;
            try {
                itemPropValue = getPropertyValue(itemPropNode);
            } catch (MicrodataParserException mpe) {
                manageError(mpe);
                continue;
            }
            result.add(
                    new ItemProp(
                            DomUtils.getXPathForNode(itemPropNode),
                            propertyName,
                            itemPropValue
                    )
            );
        }
    }
    return result;
}
/**
 * Resolves the given element identifiers against the document and returns the
 * <b>itemprop</b>s found within the referenced elements (the referenced element
 * itself included, when it declares the <b>itemprop</b> attribute).
 * <p>
 * Calls may nest while nested item scopes are dereferenced: the identifiers seen
 * during the whole recursion are tracked to detect loops, and the tracking state
 * is reset when the outermost call ends. An error raised in the outermost call is
 * handled according to the current error mode, while nested calls rethrow it.
 *
 * @param refs list of references.
 * @return list of retrieved <b>itemprop</b>s.
 * @throws MicrodataParserException if a loop is detected or a property name is missing.
 */
public ItemProp[] deferProperties(String... refs) throws MicrodataParserException {
    dereferenceRecursionCounter++;
    final List<ItemProp> collected = new ArrayList<ItemProp>();
    try {
        for (String ref : refs) {
            // Set.add returns false when the reference has been already registered.
            if (!loopDetectorSet.add(ref)) {
                throw new MicrodataParserException(
                        String.format(
                                "Loop detected with depth %d while dereferencing itemProp '%s' .",
                                dereferenceRecursionCounter - 1, ref
                        ),
                        null
                );
            }
            final Element referenced = document.getElementById(ref);
            if (referenced != null) {
                collected.addAll(getItemProps(referenced, false));
            } else {
                manageError(
                        new MicrodataParserException( String.format("Unknown itemProp id '%s'", ref ), null )
                );
            }
        }
    } catch (MicrodataParserException mpe) {
        if (dereferenceRecursionCounter != 1) {
            // Nested call: let the error reach the top call.
            throw mpe;
        }
        // Recursion end, this is the top call.
        manageError(mpe);
    } finally {
        dereferenceRecursionCounter--;
        if (dereferenceRecursionCounter == 0) {
            // Recursion end, this is the top call.
            loopDetectorSet.clear();
        }
    }
    return collected.toArray( new ItemProp[collected.size()] );
}
/**
 * Returns the {@link ItemScope} instance described within the specified <code>node</code>.
 * The item scope collects the properties found within the node plus the ones
 * referenced through the <code>itemref</code> attribute; a referenced property
 * already present among the collected ones is reported as an error and skipped.
 * Built instances are cached by node.
 *
 * @param node node describing an <i>itemscope</i>.
 * @return instance of ItemScope object.
 * @throws MicrodataParserException if an error occurs while dereferencing properties.
 */
public ItemScope getItemScope(Node node) throws MicrodataParserException {
    final ItemScope itemScope = itemScopes.get(node);
    if (itemScope != null) {
        return itemScope;
    }

    final String id       = DomUtils.readAttribute(node, "id"      , null);
    final String itemref  = DomUtils.readAttribute(node, "itemref" , null);
    final String itemType = DomUtils.readAttribute(node, "itemtype", null);
    final String itemId   = DomUtils.readAttribute(node, "itemid"  , null);

    final List<ItemProp> itemProps = getItemProps(node, true);

    // itemref is a set of whitespace separated identifiers: split on any run of
    // whitespace and treat a missing or blank attribute as "no references",
    // so that no empty identifier is dereferenced.
    final String trimmedItemref = itemref == null ? "" : itemref.trim();
    final String[] itemrefIDs = trimmedItemref.isEmpty() ? new String[0] : trimmedItemref.split("\\s+");

    final ItemProp[] deferredProperties;
    try {
        deferredProperties = deferProperties(itemrefIDs);
    } catch (MicrodataParserException mpe) {
        // Errors raised while dereferencing carry no node: attach the scope node.
        mpe.setErrorNode(node);
        throw mpe;
    }
    for (ItemProp deferredProperty : deferredProperties) {
        if (itemProps.contains(deferredProperty)) {
            manageError(
                    new MicrodataParserException(
                            String.format("Duplicated deferred itemProp '%s'.", deferredProperty.getName() ),
                            node
                    )
            );
            continue;
        }
        itemProps.add(deferredProperty);
    }

    final ItemScope newItemScope = new ItemScope(
            DomUtils.getXPathForNode(node),
            itemProps.toArray(new ItemProp[itemProps.size()]),
            id,
            itemrefIDs,
            itemType,
            itemId
    );
    itemScopes.put(node, newItemScope);
    return newItemScope;
}
/**
 * Handles the given error according to the current error mode: the error is
 * rethrown in {@link ErrorMode#StopAtFirstError} mode and added to the list of
 * collected errors in {@link ErrorMode#FullReport} mode.
 *
 * @param mpe error to be managed.
 * @throws MicrodataParserException the given error, when in {@link ErrorMode#StopAtFirstError} mode.
 * @throws IllegalStateException if the current error mode is not supported.
 */
private void manageError(MicrodataParserException mpe) throws MicrodataParserException {
    if (ErrorMode.StopAtFirstError == errorMode) {
        throw mpe;
    }
    if (ErrorMode.FullReport != errorMode) {
        throw new IllegalStateException("Unsupported mode " + errorMode);
    }
    if (null == errors) {
        errors = new ArrayList<MicrodataParserException>();
    }
    errors.add(mpe);
}
}