/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.examples.webmining;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.scaleunlimited.cascading.NullContext;

import bixo.config.BixoPlatform;
import bixo.datum.Outlink;
import bixo.datum.ParsedDatum;
import bixo.examples.crawl.SimpleBodyContentHandler;
import bixo.parser.DOMParser;

import cascading.flow.FlowProcess;
import cascading.operation.OperationCall;
import cascading.tuple.TupleEntryCollector;
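
/**
 * Analyzes a parsed HTML page: scores the visible text against lists of
 * positive and negative phrases, extracts outlinks, and turns image
 * references into {@link PageResult}s, emitting one {@link AnalyzedDatum}
 * tuple per page.
 *
 * <p>A minimal wiring sketch, assuming a parse pipe like the one in the
 * surrounding example workflow (the pipe names here are illustrative):</p>
 * <pre>
 * Pipe analyzerPipe = new Each(parsePipe, new AnalyzeHtml());
 * </pre>
 */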
@SuppressWarnings("serial")
public class AnalyzeHtml extends DOMParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(AnalyzeHtml.class);

    // Suffixes that identify a URL as a direct link to an image file.
    private static final Pattern IMG_SUFFIX_PATTERN =
        Pattern.compile("(?i)\\.(gif|jpg|jpeg|bmp|png|ico)$");

    private static final int MAX_WORDS_PER_PHRASE = 2;

    // These are all transient since we don't want to serialize them when the
    // Cascading job is submitted, so we set them up in the prepare() method.
    private transient PhraseShingleAnalyzer _analyzer;
    private transient Set<String> _positivePhrases;
    private transient Set<String> _negativePhrases;
    private transient AnalyzedDatum _result;

    public AnalyzeHtml() {
        super(AnalyzedDatum.FIELDS);
    }
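
    /**
     * Set up the transient analyzer, phrase sets, and reusable result datum
     * on the worker, since they aren't serialized with the submitted job.
     */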
    @SuppressWarnings("rawtypes")
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
        super.prepare(process, opCall);

        // Load the positive and negative phrases.
        // Analyze them using the standard analyzer (no stopwords).
        // TODO Maybe figure out the max # of words for shingling? For now use a constant.
        _analyzer = new PhraseShingleAnalyzer(MAX_WORDS_PER_PHRASE);
        _positivePhrases = loadAnalyzedPhrases("/positive-phrases.txt", _analyzer);
        _negativePhrases = loadAnalyzedPhrases("/negative-phrases.txt", _analyzer);

        _result = new AnalyzedDatum("", 0.0f, new PageResult[0], new Outlink[0]);
    }
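
    /**
     * Extract the visible text from the page, score it, collect the outlinks
     * and image results, and emit a single AnalyzedDatum tuple for the page.
     */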
    @SuppressWarnings("rawtypes")
    @Override
    protected void process(ParsedDatum datum, Document doc, TupleEntryCollector collector,
                    FlowProcess process) throws Exception {
        // Stream the DOM through a handler that collects the visible body
        // text, then score that text.
        SimpleBodyContentHandler bodyContentHandler = new SimpleBodyContentHandler();
        SAXWriter writer = new SAXWriter(bodyContentHandler);
        writer.write(doc);
        float pageScore = getScore(bodyContentHandler.toString());

        // Get the outlinks.
        Outlink[] outlinks = getOutlinks(doc);

        // Extract all of the images, and use them as page results.
        PageResult[] pageResults = extractImages(datum.getUrl(), doc, outlinks);

        // Reuse the single transient result datum, cloning its tuple before
        // emitting so the collector gets an independent copy.
        _result.setUrl(datum.getUrl());
        _result.setPageScore(pageScore);
        _result.setOutlinks(outlinks);
        _result.setPageResults(pageResults);

        collector.add(BixoPlatform.clone(_result.getTuple(), process));
    }
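
    /**
     * Called by DOMParser when parsing or processing a page fails.
     */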
    @Override
    protected void handleException(ParsedDatum datum, Exception e, TupleEntryCollector collector) {
        // We'll just log it here, though normally we'd want to rethrow the exception, and
        // have our workflow set up to trap it.
        LOGGER.error("Exception parsing/processing " + datum.getUrl(), e);
    }
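
    /**
     * Collect every &lt;a&gt; element in the page and convert it into an
     * {@link Outlink} (URL, anchor text, and rel attribute).
     */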
    private Outlink[] getOutlinks(Document doc) {
        ArrayList<Outlink> outlinkList = new ArrayList<Outlink>();
        List<Node> aNodes = getNodes(doc, "//a");

        for (Node node : aNodes) {
            String url = getAttributeFromNode(node, "href");
            // Use the element's text content as the anchor text (the "name"
            // attribute is a legacy anchor target, not the link text).
            String anchor = node.getStringValue().trim();
            String rel = getAttributeFromNode(node, "rel");
            Outlink link = new Outlink(url, anchor, rel);
            outlinkList.add(link);
        }

        return outlinkList.toArray(new Outlink[outlinkList.size()]);
    }
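
    /**
     * Build PageResults for the page's images: outlinks whose URLs end with
     * an image suffix, plus every &lt;img&gt; tag's src/alt pair.
     */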
    private PageResult[] extractImages(String sourceUrl, Document doc, Outlink[] outlinks) {
        ArrayList<PageResult> pageResults = new ArrayList<PageResult>();

        // First see if we have image links that were extracted as Outlinks.
        for (Outlink outlink : outlinks) {
            String outlinkUrl = outlink.getToUrl();
            if (isImgSuffix(outlinkUrl)) {
                // TODO Maybe set description to any words found in image name? Change '-' and '_' to spaces?
                PageResult result = new PageResult(sourceUrl, outlinkUrl, "");
                pageResults.add(result);
            }
        }

        // Next extract all of the <img> tags, using the alt text as the description.
        List<Node> imgNodes = getNodes(doc, "//img");
        for (Node node : imgNodes) {
            String src = getAttributeFromNode(node, "src");
            String alt = getAttributeFromNode(node, "alt");
            PageResult result = new PageResult(sourceUrl, src, alt);
            pageResults.add(result);
        }

        return pageResults.toArray(new PageResult[pageResults.size()]);
    }
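
    /**
     * Return the value of the named attribute on an element node, or an
     * empty string if the node isn't an element or lacks the attribute.
     */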
    private String getAttributeFromNode(Node node, String attribute) {
        String attributeValue = null;
        if (node.getNodeType() == Node.ELEMENT_NODE) {
            Element e = (Element)node;
            attributeValue = e.attributeValue(attribute);
        }

        return (attributeValue == null ? "" : attributeValue);
    }
    /**
     * Utility routine to get back a list of nodes from the HTML page document
     * which match the provided XPath expression.
     *
     * @param node node to search from
     * @param xPath XPath expression to match
     * @return list of matching nodes, or an empty list if nothing matches
     */
    @SuppressWarnings("unchecked")
    private List<Node> getNodes(Node node, String xPath) {
        List<Node> result = node.selectNodes(xPath);
        if (result == null) {
            result = new ArrayList<Node>();
        }

        return result;
    }
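
    /**
     * Return true if the URL ends with a known image file suffix.
     */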
    private static boolean isImgSuffix(String url) {
        Matcher m = IMG_SUFFIX_PATTERN.matcher(url);
        return m.find();
    }
    /**
     * Calculate the positive term ratio (positive term count / total term count)
     * and the matching negative term ratio. The score is the positive ratio
     * minus the negative ratio, so it always falls in [-1.0, 1.0].
     */
    private float getScore(String extractedContent) throws Exception {
        List<String> allTerms = _analyzer.getTermList(extractedContent);

        int positiveCount = 0;
        int negativeCount = 0;
        int neutralCount = 0;

        for (String term : allTerms) {
            if (_positivePhrases.contains(term)) {
                positiveCount += 1;
            } else if (_negativePhrases.contains(term)) {
                negativeCount += 1;
            } else {
                neutralCount += 1;
            }
        }

        float totalCount = (float)(positiveCount + negativeCount + neutralCount);
        float positiveRatio = 0;
        float negativeRatio = 0;
        if (totalCount > 0) {
            positiveRatio = positiveCount / totalCount;
            negativeRatio = negativeCount / totalCount;
        }

        return positiveRatio - negativeRatio;
    }
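
    /**
     * Load a phrase file from the classpath, run each non-comment line
     * through the analyzer, and return the resulting set of analyzed phrases.
     */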
    private Set<String> loadAnalyzedPhrases(String fileName, PhraseShingleAnalyzer analyzer) {
        InputStream is = AnalyzeHtml.class.getResourceAsStream(fileName);
        Set<String> result = new HashSet<String>();
        try {
            List<String> lines = IOUtils.readLines(is);
            for (String line : lines) {
                // Skip comment lines.
                if (line.trim().startsWith("#")) {
                    continue;
                }

                // Use the analyzer that was passed in, so the phrases get
                // processed the same way as the extracted page text.
                String analyzedPhrase = analyzer.getAnalyzedPhrase(line);
                result.add(analyzedPhrase);
            }
        } catch (Exception e) {
            throw new RuntimeException("Error loading file: " + fileName, e);
        } finally {
            IOUtils.closeQuietly(is);
        }

        return result;
    }
}