/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
* PDF Content Extractor. This will parse the text content of a PDF and apply a
* regex to search for links within the body of the text.
*
* Requires itextpdf jar: http://repo1.maven.org/maven2/com/itextpdf/itextpdf/5.5.0/itextpdf-5.5.0.jar
*
* @contributor adam
*/
public class ExtractorPDFContent extends ContentExtractor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorPDFContent.class.getName());

    // Capture-group indexes into URLPattern. They are counted by opening
    // parenthesis, so they must be revisited whenever the pattern changes.
    /** Scheme, i.e. "http" or "https". */
    private static final int GROUP_SCHEME = 1;
    /** Optional "user[:password]@" prefix, including the trailing '@'. */
    private static final int GROUP_AUTH = 2;
    /** Host (domain name or dotted IP) plus optional ":port". */
    private static final int GROUP_HOST_PORT = 6;
    /** Path and query string found before any line break. */
    private static final int GROUP_PATH_QUERY = 13;
    /** Optional continuation of the URL after a line break. */
    private static final int GROUP_NEWLINE_TAIL = 19;

    /**
     * Case-insensitive pattern matching http/https URLs in page text. A match
     * may begin with a single opening parenthesis and may continue across one
     * newline, provided the text after the newline does not itself start with
     * "http://".
     */
    public static final Pattern URLPattern = Pattern.compile(
        "(?i)\\(?(https?):\\/\\/"+ // protocol
        "(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+"+ // username
        "(:([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+)?"+ // password
        "@)?(?"+ // auth requires @
        ")((([a-z0-9]\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\.)*"+ // domain segments AND
        "[a-z][a-z0-9-]*[a-z0-9]"+ // top level domain OR
        "|((\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])\\.){3}"+
        "(\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])"+ // IP address
        ")(:\\d+)?)"+ // port
        "(((\\/+([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // path
        "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?)?)?"+ // query string
        "(\\n(?!http://)"+ // possible newline (seems to happen in pdfs)
        "((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*"+ // continue possible path
        "(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?"+ // or possible query
        ")?");

    // Instance initializer: installs the default size limit for every new
    // instance (10MB).
    {
        setMaxSizeToParse(10*1024*1024L); // 10MB
    }

    /**
     * The maximum size of PDF files to consider. PDFs larger than this
     * maximum will not be searched for links.
     *
     * @return the current limit, read from the "maxSizeToParse" entry of
     *         {@code kp}
     */
    public long getMaxSizeToParse() {
        return (Long) kp.get("maxSizeToParse");
    }

    /**
     * Sets the maximum size of PDF files to consider; see
     * {@link #getMaxSizeToParse()}.
     *
     * @param threshold the new limit, compared in {@link #shouldExtract}
     *        against the recorded input size
     */
    public void setMaxSizeToParse(long threshold) {
        kp.put("maxSizeToParse",threshold);
    }

    public ExtractorPDFContent() {
    }

    /**
     * Reads the recorded content of {@code curi} as a PDF, scans the text of
     * every page with {@link #URLPattern} and adds each candidate URL to the
     * URI's outlinks.
     *
     * @param curi the URI whose recorded content is parsed
     * @return {@code false} if reading or parsing the PDF threw an
     *         {@link IOException} or {@link RuntimeException} (the exception
     *         is added to the URI's non-fatal failures); {@code true}
     *         otherwise, including when no URL was found
     */
    protected boolean innerExtract(CrawlURI curi){
        List<String> uris = new ArrayList<String>();
        InputStream pdfStream = null;
        PdfReader documentReader = null;
        try {
            pdfStream = curi.getRecorder().getContentReplayInputStream();
            documentReader = new PdfReader(pdfStream);
            int pageCount = documentReader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) { //Page numbers start at 1
                collectUris(extractPageText(documentReader, page), uris);
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } catch (RuntimeException e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } finally {
            // Release the reader and the replay stream on every exit path.
            closeQuietly(documentReader, pdfStream);
        }

        if (uris.isEmpty()) {
            return true;
        }

        for (String uri : uris) {
            try {
                LinkContext lc = LinkContext.NAVLINK_MISC;
                Hop hop = Hop.NAVLINK;
                CrawlURI out = curi.createCrawlURI(uri, lc, hop);
                curi.getOutLinks().add(out);
            } catch (URIException e1) {
                logUriError(e1, curi.getUURI(), uri);
            }
        }

        // Every candidate is counted, including those rejected just above.
        numberOfLinksExtracted.addAndGet(uris.size());
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine(curi+" has "+uris.size()+" links.");
        }
        return true;
    }

    /**
     * Appends every URL candidate found in {@code pageText} to {@code uris}.
     * One match can yield up to three entries: the match itself, the match
     * without a trailing '.', and the match cut off at an embedded newline.
     */
    private static void collectUris(String pageText, List<String> uris) {
        Matcher matcher = URLPattern.matcher(pageText);
        while (matcher.find()) {
            String prospectiveURL = matcher.group().trim();

            // Remember the parenthesis before stripping it: the alternate
            // URL handling below needs to know the match was wrapped.
            boolean wrappedInParens = prospectiveURL.startsWith("(");
            if (wrappedInParens) {
                prospectiveURL = prospectiveURL.substring(1);
                if (prospectiveURL.endsWith(")")) {
                    prospectiveURL = prospectiveURL.substring(0, prospectiveURL.length() - 1);
                }
            }
            uris.add(prospectiveURL);

            //parsetext URLs tend to end in a '.' if they are in a sentence, queue without trailing '.'
            if (prospectiveURL.endsWith(".") && prospectiveURL.length() > 2) {
                uris.add(prospectiveURL.substring(0, prospectiveURL.length() - 1));
            }

            //Full regex allows newlines which seem to be common, also add match without newline in case we are wrong
            if (matcher.group(GROUP_NEWLINE_TAIL) != null) {
                String auth = matcher.group(GROUP_AUTH);
                String alternateURL = matcher.group(GROUP_SCHEME) + "://"
                        + (auth != null ? auth : "")
                        + matcher.group(GROUP_HOST_PORT)
                        + matcher.group(GROUP_PATH_QUERY);
                //Again, handle URLs wrapped in parentheses
                if (wrappedInParens && alternateURL.endsWith(")")) {
                    alternateURL = alternateURL.substring(0, alternateURL.length() - 1);
                }
                uris.add(alternateURL);
            }
        }
    }

    /**
     * Closes the reader and then the stream, logging (at FINE) instead of
     * propagating any failure so that cleanup cannot mask the extraction
     * result. Either argument may be {@code null}.
     */
    private static void closeQuietly(PdfReader reader, InputStream stream) {
        if (reader != null) {
            try {
                reader.close();
            } catch (RuntimeException e) {
                LOGGER.log(Level.FINE, "Failed to close pdf reader", e);
            }
        }
        if (stream != null) {
            try {
                // NOTE(review): the reader may already have closed this
                // stream; closing it again is assumed to be harmless.
                stream.close();
            } catch (IOException e) {
                LOGGER.log(Level.FINE, "Failed to close pdf content stream", e);
            }
        }
    }

    /**
     * Extracts the text of one page using a
     * {@link SimpleTextExtractionStrategy}.
     *
     * @param documentReader reader for the PDF being processed
     * @param pageNum page number to extract; page numbers start at 1
     * @return the page text, or the empty string if an {@link IOException}
     *         occurred (the failure is logged at WARNING)
     */
    public String extractPageText(PdfReader documentReader, int pageNum){
        String content = "";
        PdfReaderContentParser parser = new PdfReaderContentParser(documentReader);
        try {
            TextExtractionStrategy strat =
                parser.processContent(pageNum, new SimpleTextExtractionStrategy());
            content = strat.getResultantText();
        } catch (IOException e) {
            // Best effort: one unreadable page should not abort the others.
            LOGGER.log(Level.WARNING, "Failed to parse pdf text in "
                    + Thread.currentThread().getName(), e);
        }
        return content;
    }

    /**
     * Accepts a URI only if its recorded input is no larger than
     * {@link #getMaxSizeToParse()} and its content type starts with
     * "application/pdf".
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        long max = getMaxSizeToParse();
        if (uri.getRecorder().getRecordedInput().getSize() > max) {
            return false;
        }
        String ct = uri.getContentType();
        return (ct != null) && (ct.startsWith("application/pdf"));
    }
}