* @date 2013-1-7 上午11:08:54
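* @param req the fetch request; its URL is the address to download, and the headers actually sent are recorded back into it
* @return the fetch result holding the status code, the response headers, the fetched (or moved-to) URL and whatever assemPage extracted from the response body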
*/
public FetchResult fetch(FetchRequest req) throws Exception {
    // Overview: performs a rate-limited HTTP GET for req.getUrl().
    //  - The configured default headers plus "Accept-Encoding: gzip" are sent; the headers that were
    //    sent are recorded into req.getHeaders(), the response headers into fetchResult.getHeaders().
    //  - 200 with a body: the body is handed to assemPage and the status code is stored.
    //  - Non-200: the status code is stored; for 301/302 the canonicalized Location is stored as the
    //    moved-to URL; the body is handed to assemPage only if the status is in the site's skip list.
    //  - Any Throwable: reported in the result (INTERNAL_SERVER_ERROR), never propagated from the try.
    //  - 200 without a body: UNSPECIFIED_ERROR.
    FetchResult fetchResult = new FetchResult();
    HttpGet get = null;
    // True only after assemPage(...) returned normally for a non-null entity. Whenever it is still
    // false in the finally block the request is aborted so the underlying connection is released.
    boolean entityConsumed = false;
    String toFetchURL = req.getUrl();
    try {
        get = new HttpGet(toFetchURL);
        // Ask for a gzip-compressed response. Note: gzip decompression must already have been
        // set up on the client beforehand.
        get.addHeader("Accept-Encoding", "gzip");
        for (Entry<String, String> entry : headers.entrySet()) {
            get.setHeader(entry.getKey(), entry.getValue());
        }

        // Politeness throttle: serialize on the mutex and, before actually hitting the server, wait
        // until the configured delay has elapsed since the previous fetch.
        // TODO support a per-request delay setting
        synchronized (mutex) {
            long now = System.currentTimeMillis();
            long remaining = config.getPolitenessDelay() - (now - lastFetchTime);
            if (remaining > 0) {
                Thread.sleep(remaining);
            }
            // Always refresh the last fetch time; it is tracked per host, not per URL.
            lastFetchTime = System.currentTimeMillis();
        }

        // Record the request headers on the request object.
        recordHeaderValues(get.getAllHeaders(), req.getHeaders());
        fetchResult.setReq(req);

        // Execute the GET and record the response headers on the result.
        HttpResponse response = httpClient.execute(get);
        recordHeaderValues(response.getAllHeaders(), fetchResult.getHeaders());

        // Store the fetched URL; prefer the request's own URI when it differs from the requested
        // URL even after canonicalization.
        fetchResult.setFetchedUrl(toFetchURL);
        String uri = get.getURI().toString();
        if (!uri.equals(toFetchURL) && !URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
            fetchResult.setFetchedUrl(uri);
        }

        HttpEntity entity = response.getEntity();
        // Status code returned by the server.
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            boolean redirect = statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                    || statusCode == HttpStatus.SC_MOVED_TEMPORARILY;
            if (redirect) {
                // For 301/302 only the redirect target needs to be captured.
                Header locationHeader = response.getFirstHeader("Location");
                if (locationHeader != null) {
                    fetchResult.setMovedToUrl(
                            URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL));
                }
            }
            // For every non-OK status only the status code (and redirect URL) is returned, unless the
            // site configuration lists this status code as one to ignore.
            String skipCodes = this.site.getSkipStatusCode();
            if (skipCodes != null && skipCodes.trim().length() > 0) {
                for (String code : skipCodes.split(",")) {
                    // Trim so that a list written as "404, 500" still matches.
                    if (statusCode == CommonUtil.toInt(code.trim())) {
                        // Status code is ignored: parse the entity anyway.
                        assemPage(fetchResult, entity);
                        entityConsumed = entity != null;
                        break;
                    }
                }
            }
            fetchResult.setStatusCode(statusCode);
            return fetchResult;
        }

        // Process the entity returned by the server.
        if (entity != null) {
            fetchResult.setStatusCode(statusCode);
            assemPage(fetchResult, entity);
            entityConsumed = true;
            return fetchResult;
        }
    } catch (Throwable e) {
        if (e instanceof InterruptedException) {
            // Keep the interrupt visible to the caller instead of silently clearing it.
            Thread.currentThread().interrupt();
        }
        e.printStackTrace();
        // NOTE(review): the exception text is stored in the fetched-URL field and the enum ordinal
        // is used as the status code; kept as-is because callers may rely on it - confirm.
        fetchResult.setFetchedUrl(e.toString());
        fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
        return fetchResult;
    } finally {
        // Release the connection whenever the body was not handed to assemPage (no entity, a
        // non-OK status that is not skipped, or a failure along the way). assemPage is presumed to
        // read the entity to completion - TODO confirm.
        if (get != null && !entityConsumed) {
            get.abort();
        }
    }
    // Reached only for a 200 response that carried no entity.
    fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal());
    return fetchResult;
}

/**
 * Appends the value of every given header to the list stored under the header's name in
 * {@code target}, creating the list on first use so repeated header names keep all their values.
 *
 * @param source headers to record
 * @param target map from header name to the values seen so far; modified in place
 */
private void recordHeaderValues(Header[] source, Map<String, List<String>> target) {
    for (Header header : source) {
        String name = header.getName();
        List<String> values = target.get(name);
        if (values == null) {
            values = new ArrayList<String>();
            target.put(name, values);
        }
        values.add(header.getValue());
    }
}