/*
* Copyright 2010 Peter Karich jetwick_@_pannous_._info
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.util;
import com.google.inject.Inject;
import de.jetwick.tw.*;
import de.jetwick.data.JTweet;
import de.jetwick.data.UrlEntry;
import de.jetwick.es.ElasticTweetSearch;
import de.jetwick.snacktory.HtmlFetcher;
import de.jetwick.snacktory.JResult;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.elasticsearch.common.cache.CacheBuilder;
import org.elasticsearch.common.collect.MapMaker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * This class takes the urls from the article index and resolves them. Additionally,
 * and more importantly, it stores the resolved text and title into the article index.
 *
 * @author Peter Karich, jetwick_@_pannous_._info
 */
public class GenericUrlResolver extends MyThread implements AnyExecutor<JTweet> {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    // number of parallel worker threads taking tweets from resolverQueue
    private int resolveThreads = 5;
    // timeout in milliseconds used both for url resolving and page fetching
    private int resolveTimeout = 500;
    private ExecutorService service;
    // if > 0 the workers are forcibly stopped after this many milliseconds (test mode)
    private long testWait = -1;
    // tweets whose url still has to be resolved and fetched
    protected BlockingQueue<JTweet> resolverQueue;
    @Inject
    private ElasticTweetSearch tweetSearch;
    private UrlTitleCleaner urlTitleCleaner = new UrlTitleCleaner();
    @Inject
    private HtmlFetcher fetcher;
    // url -> tweet which is still waiting for its url to be resolved
    private final Map<String, JTweet> unresolvedCache;
    // urls of tweets considered too old to resolve; values are only the dummy marker below
    private final Map<String, Object> tooOldMap;
    private static final Object OBJECT = new Object();
    // statistics: number of fetched urls and of fetches yielding an empty title
    private final AtomicInteger counter = new AtomicInteger(0);
    private final AtomicInteger emptyTitleCounter = new AtomicInteger(0);
    private final AtomicLong start = new AtomicLong(System.nanoTime());

    /**
     * @param queueSize maximum number of tweets allowed to wait for url resolution;
     *                  putObject blocks when the queue is full
     */
    public GenericUrlResolver(int queueSize) {
        super("generic-url-resolver");
        unresolvedCache = createGenericCache(5000, 24 * 60);
        tooOldMap = createGenericCache(500, 24 * 60);
        resolverQueue = new LinkedBlockingQueue<JTweet>(queueSize);
    }

    /**
     * Creates a concurrent, size-bounded cache whose entries expire after the
     * given number of minutes without access.
     *
     * @param count   maximum number of entries
     * @param minutes expire-after-access time in minutes
     */
    // cast is safe: asMap() returns the ConcurrentMap view of the freshly built cache
    @SuppressWarnings("unchecked")
    public static <K, V> Map<K, V> createGenericCache(int count, int minutes) {
        // do NOT use .softKeys() otherwise we will get == comparison which
        // is bad for 'new Long'
        return (ConcurrentMap<K, V>) CacheBuilder.newBuilder().concurrencyLevel(20).maximumSize(count).
                expireAfterAccess(minutes, TimeUnit.MINUTES).build().asMap();
    }

    public GenericUrlResolver setHtmlFetcher(HtmlFetcher fetcher) {
        this.fetcher = fetcher;
        return this;
    }

    /** Enables test mode: workers run at most testWait milliseconds. */
    public GenericUrlResolver setTest(long testWait) {
        this.testWait = testWait;
        return this;
    }

    public void setResolveTimeout(int resolveTimeout) {
        this.resolveTimeout = resolveTimeout;
    }

    public int getResolveTimeout() {
        return resolveTimeout;
    }

    public GenericUrlResolver setResolveThreads(int resolveThreads) {
        this.resolveThreads = resolveThreads;
        return this;
    }

    /** Lazily creates the worker pool sized by resolveThreads. Not thread-safe; call from one thread. */
    public ExecutorService getService() {
        if (service == null)
            service = Executors.newFixedThreadPool(resolveThreads);
        return service;
    }

    public BlockingQueue<JTweet> getInputQueue() {
        return resolverQueue;
    }

    JTweet findUrlInCache(String url) {
        return unresolvedCache.get(url);
    }

    int getUnresolvedSize() {
        return unresolvedCache.size();
    }

    /**
     * Starts resolveThreads workers which loop on executeResolve until it
     * returns false (interruption) and blocks until all of them finished.
     */
    @Override
    public void run() {
        Collection<Callable<Object>> workerCollection = new ArrayList<Callable<Object>>(resolveThreads);
        for (int i = 0; i < resolveThreads; i++) {
            final int workerId = i;
            workerCollection.add(new Callable<Object>() {
                @Override
                public Object call() throws Exception {
                    try {
                        while (true) {
                            if (!executeResolve(workerId))
                                break;
                        }
                        logger.info(getName() + " stopped");
                    } catch (Throwable ex) {
                        // catch Throwable so a silently dying worker is at least logged
                        logger.error("url resolver " + workerId + " died", ex);
                    }
                    return null;
                }
            });
        }
        try {
            if (testWait > 0)
                getService().invokeAll(workerCollection, testWait, TimeUnit.MILLISECONDS);
            else
                getService().invokeAll(workerCollection);
            logger.warn("FINISHED " + getName() + " testWait:" + testWait);
        } catch (InterruptedException ex) {
            // restore the interrupt status so the owner of this thread can detect it
            Thread.currentThread().interrupt();
            logger.info(getName() + " was interrupted:" + ex.getMessage());
        }
    }

    /**
     * Entry point for new tweets: decides whether the tweet is fed directly to
     * the index (no url, url too old) or handed to putObject for resolution.
     */
    public void queueObject(JTweet tw) {
        // if tweet is persistent we need to queue it
        boolean directlyQueueIt = false;
        String url = tw.getUrl();
        if (tweetSearch.tooOld(tw.getCreatedAt())) {
            tooOldMap.put(url, OBJECT);
            unresolvedCache.remove(url);
            directlyQueueIt = true;
        } else {
            if (Helper.isEmpty(url))
                tweetSearch.queueObject(tw);
            else if (tooOldMap.containsKey(url)) {
                logger.warn("(2) Skipped too old tweet: " + url);
                directlyQueueIt = true;
            } else {
                putObject(tw);
            }
        }
        if (!directlyQueueIt && tw.isPersistent())
            tweetSearch.queueObject(tw);
    }

    /**
     * Feeds the tweet to the index if it (or its url) is already known,
     * otherwise registers it in unresolvedCache and hands it to the resolver
     * queue. May block when resolverQueue is full.
     */
    void putObject(JTweet tw) {
        if (isTweetInIndex(tw)) {
            // no need to queue again to aindex as we queue if article already exists on every resolve
            unresolvedCache.remove(tw.getUrl());
            canRemoveOrigUrl(tw);
            tweetSearch.queueObject(tw);
        } else {
            if (canRemoveOrigUrl(tw)) {
                tweetSearch.queueObject(tw);
                return;
            }
            String url = tw.getUrl();
            boolean alreadyExistent = false;
            // check the shortened url first, then its original url
            for (int i = 0; i < 2; i++) {
                JTweet old = unresolvedCache.put(url, tw);
                if (old != null) {
                    if (tw.getTwitterId() == old.getTwitterId())
                        tw.updateFrom(old);
                    tweetSearch.queueObject(tw);
                    alreadyExistent = true;
                    break;
                }
                String tmp = getFirstOrigUrl(tw);
                if (Helper.isEmpty(tmp) || tmp.equals(url))
                    break;
                url = tmp;
                // try again for original url
            }
            if (!alreadyExistent)
                try {
                    resolverQueue.put(tw);
                } catch (InterruptedException ex) {
                    logger.error("Couldn't put article:" + tw.getUrl(), ex);
                }
        }
    }

    /** @return the original url of the tweet's first url entry or null if there is none */
    private String getFirstOrigUrl(JTweet tw) {
        if (tw.getUrlEntries().size() > 0)
            return tw.getUrlEntries().iterator().next().getOriginalUrl(tw);
        return null;
    }

    /**
     * Takes one tweet from the queue, resolves and fetches its url and feeds
     * the (possibly enriched) tweet to the index.
     *
     * @param thread worker id, only used for logging
     * @return false when this worker was interrupted and should stop
     */
    public boolean executeResolve(final int thread) {
        JTweet tweet = null;
        try {
            tweet = resolverQueue.take();
        } catch (InterruptedException ex) {
            // restore interrupt status and signal the worker loop to stop
            Thread.currentThread().interrupt();
            if (thread == 0)
                logger.warn("url resolver " + thread + " died " + ex.getMessage());
            return false;
        }
        String origUrl = tweet.getUrl();
        String url = origUrl;
        try {
            boolean doFetch = true;
            String resUrl = fetcher.getResolvedUrl(url, resolveTimeout);
            if (!Helper.isEmpty(resUrl) && resUrl.length() > url.length()) {
                url = resUrl;
                // check if resolved url already exists
                if (exists(resUrl)) {
                    unresolvedCache.remove(resUrl);
                    doFetch = false;
                }
            }
            if (doFetch) {
                JResult res = fetcher.fetchAndExtract(url, resolveTimeout, false);
                // set resolved url
                if (tweet.getUrlEntries().size() > 0) {
                    UrlEntry ue = tweet.getUrlEntries().iterator().next();
                    ue.setResolvedUrl(res.getUrl());
                    ue.setResolvedTitle(res.getTitle());
                    ue.setResolvedSnippet(res.getText());
                    ue.setResolvedDomain(Helper.extractDomain(url));
                }
                // penalize tweets pointing to blacklisted titles (spam etc.)
                if (urlTitleCleaner.contains(res.getTitle()))
                    tweet.setQuality(20);
                if (res.getTitle().isEmpty())
                    emptyTitleCounter.incrementAndGet();
                counter.incrementAndGet();
                // only the first few workers report throughput to limit log noise
                if (thread < 3) {
                    float secs = (System.nanoTime() - start.get()) / 1e+9f;
                    logger.info(thread + "| " + counter.get() / secs + " entries/sec"//, secs:" + secs
                            + ", feeded:" + counter
                            + ", resolverQueue.size:" + resolverQueue.size()
                            + ", unresolved.size:" + unresolvedCache.size()
                            + ", tooOld.size:" + tooOldMap.size()
                            + ", empty titles:" + emptyTitleCounter);
                }
            }
        } catch (Exception ex) {
            //logger.info("Error while resolveAndFetch url:" + art.getUrl() + " Error:" + Helper.getMsg(ex));
            // fetch failed: lower quality instead of dropping the tweet
            tweet.setQuality(Math.round(tweet.getQuality() * 0.8f));
        } finally {
            // always feed the article even if there was an error
            tweetSearch.queueObject(tweet);
            // real time get ensures that we have at least the url in aindex (not so for origURL!)
            unresolvedCache.remove(tweet.getUrl());
            // DISABLED for now as
            // if (!checkAgainQueue.offer(art))
            //     logger.error("checkAgainQueue full. Skipped:" + art.getUrl());
        }
        return true;
    }

    /** @return true if the tweet is already indexed, found via twitter id or url */
    boolean isTweetInIndex(JTweet tw) {
        JTweet existing = tweetSearch.findByTwitterId(tw.getTwitterId());
        if (existing != null)
            return true;
        return exists(tw.getUrl());
    }

    /**
     * Removes all of the tweet's original urls that are already indexed from
     * unresolvedCache.
     *
     * @return true if at least one original url was already in the index
     */
    boolean canRemoveOrigUrl(JTweet tw) {
        boolean remove = false;
        for (UrlEntry as : tw.getUrlEntries()) {
            String oUrl = as.getOriginalUrl(tw);
            // is original url already in index?
            if (oUrl != null && exists(oUrl)) {
                unresolvedCache.remove(oUrl);
                remove = true;
            }
        }
        return remove;
    }

    /** @return true if at least one indexed tweet references this url */
    boolean exists(String url) {
        return !tweetSearch.findByUrl(url).isEmpty();
    }

    /** AnyExecutor entry point: delegates to queueObject and returns the tweet. */
    @Override
    public JTweet execute(JTweet tweet) {
        queueObject(tweet);
        return tweet;
    }

    public void setTweetSearch(ElasticTweetSearch tweetSearch) {
        this.tweetSearch = tweetSearch;
    }

    public ElasticTweetSearch getTweetSearch() {
        return tweetSearch;
    }
}