Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Response


            // which may be successful faster because of a cache hit
        }

        this.loaderSteering.put(request.url(), new Semaphore(0));
        try {
            final Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
            check = this.loaderSteering.remove(request.url());
            if (check != null) check.release(1000);
            return response;
        } catch (final IOException e) {
            // release the semaphore anyway so that waiting threads are not blocked forever
            check = this.loaderSteering.remove(request.url());
            if (check != null) check.release(1000);
            throw e;
        }
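The snippet above serializes concurrent loads of the same URL: the first thread parks a zero-permit semaphore in loaderSteering, later requests for that URL can wait on it, and once the load finishes every waiter is released and can then profit from a cache hit. A minimal self-contained sketch of the idea, with hypothetical names and a stand-in for the real loadInternal():

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

/** Sketch: serialize concurrent loads of the same URL via a parked semaphore. */
public class LoadSteeringSketch {

    private final ConcurrentMap<String, Semaphore> loaderSteering = new ConcurrentHashMap<String, Semaphore>();

    public String load(final String url) throws InterruptedException {
        final Semaphore inFlight = this.loaderSteering.get(url);
        if (inFlight != null) {
            // another thread is loading the same URL: wait (bounded) for it,
            // then proceed; our own load may now be answered from the cache
            inFlight.tryAcquire(10, TimeUnit.SECONDS);
        }
        this.loaderSteering.put(url, new Semaphore(0));
        try {
            return "content of " + url; // stand-in for the real loadInternal()
        } finally {
            // wake up to 1000 waiting threads, whether we succeeded or failed
            final Semaphore check = this.loaderSteering.remove(url);
            if (check != null) check.release(1000);
        }
    }
}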


                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);

                // check which caching strategy shall be used
                if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be IFFRESH, which means we should do a proxy freshness test
                assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
                    return response;
                } else {
                    this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
                }
            } else if (cachedResponse != null) {
                this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
            } else if (content != null) {
                this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
            }
        }

        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CacheStrategy.CACHEONLY) {
            // we had a chance to get the content from the cache, but it is not there. We don't have it.
            throw new IOException("cache only strategy");
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we may have checked already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!url.isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                // force a sleep here; instead of just sleeping, we use the time to clean up the accessTime map
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime)
                    try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
            }
        }

        // now it's for sure that we will access the target. Remember the access time
        if (host != null) accessTime.put(host, System.currentTimeMillis());

        // load resource from the internet
        Response response = null;
        if (protocol.equals("http") || protocol.equals("https")) response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = this.ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = this.smbLoader.load(request, true);
        if (protocol.equals("file")) response = this.fileLoader.load(request, true);
        if (response != null && response.getContent() != null) {
            // we got something. Now check if we want to store that to the cache
            // first check looks if we want to store the content to the cache
            if (crawlProfile == null || !crawlProfile.storeHTCache()) {
                // no caching wanted. That's ok, do not write any message
                return response;
            }
            // the second check tells us whether the protocol itself says something about caching
            final String storeError = response.shallStoreCacheForCrawler();
            if (storeError == null) {
                try {
                    Cache.store(url, response.getResponseHeader(), response.getContent());
                } catch (final IOException e) {
                    this.log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
                }
            } else {
                this.log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
            }
            return response;
        }

        throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
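The politeness logic above is a small building block of its own: remember when each host was last contacted and sleep until at least a minimum delay has passed before touching it again. A minimal sketch, with an assumed constant in place of the configured minimum delay and without the accessTime cleanup pass:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/** Sketch of the per-host politeness delay used above. */
public class AccessThrottleSketch {

    // assumption: the real crawler derives this value from its configuration
    private static final long MIN_DELAY_MILLIS = 250;

    private final Map<String, Long> accessTime = new ConcurrentHashMap<String, Long>();

    public void delayBeforeAccess(final String host) {
        final Long lastAccess = this.accessTime.get(host);
        if (lastAccess != null) {
            final long wait = MIN_DELAY_MILLIS + lastAccess.longValue() - System.currentTimeMillis();
            if (wait > 0) {
                try { Thread.sleep(wait); } catch (final InterruptedException e) { Thread.currentThread().interrupt(); }
            }
        }
        // remember the access time for the next caller
        this.accessTime.put(host, Long.valueOf(System.currentTimeMillis()));
    }
}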

     * @return the content as {@code byte[]}, or {@code null} if the resource was not found
     * @throws IOException
     */
    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
        // try to download the resource using the loader
        final Response entry = load(request, cacheStrategy, false);
        if (entry == null) return null; // not found in web

        // read resource body (if it is there)
        return entry.getContent();
    }
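A hypothetical call site for loadContent, reusing the request(...) helper that appears in the other snippets on this page; assume the enclosing method declares IOException (DigestURI's constructor may throw MalformedURLException, which is an IOException):

// fetch the raw bytes of a page, preferring a cached copy if one exists
final DigestURI url = new DigestURI("http://example.org/");
final byte[] content = loader.loadContent(loader.request(url, true, false), CacheStrategy.IFEXIST);
if (content == null) {
    // the resource was not found on the web
}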

    }

    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure {

        // load resource
        final Response response = load(request, cacheStrategy, maxFileSize, false);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);

        // if it is still not available, report an error
        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

        // parse resource
        return response.parse();
    }
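A hypothetical call site for loadDocuments; the 30-second timeout and 10 MB size cap are chosen purely for illustration:

final Document[] documents = loader.loadDocuments(
        loader.request(new DigestURI("http://example.org/"), true, false),
        CacheStrategy.IFFRESH, 30000, 10 * 1024 * 1024);
for (final Document document : documents) {
    System.out.println(document); // one entry per document parsed out of the resource
}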

    public static Map<String, String> getListFriends(final LoaderDispatcher loader) {
        final Map<String, String> map = new TreeMap<String, String>();
        Map<String, String> m;
        for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
            if (!oaiFriend.getValue().exists()) {
                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
                if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
            }

            if (oaiFriend.getValue().exists()) {
                final byte[] b = FileUtils.read(oaiFriend.getValue());
                if (b != null) {
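The loop above follows a download-if-missing pattern: each OAI friend list is fetched from the network only when no local copy exists yet, then read from disk. The same idea as a hypothetical standalone helper:

// download a URL into a local file unless that file already exists
static void ensureLocalCopy(final LoaderDispatcher loader, final String url, final File target) throws IOException {
    if (target.exists()) return;
    final Response response = loader.load(
            loader.request(new DigestURI(url), false, true),
            CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
    if (response != null) FileUtils.copy(response.getContent(), target);
}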

        return response.parse();
    }

    public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
        // load page
        final Response r = this.load(request(location, true, false), cachePolicy, false);
        final byte[] page = (r == null) ? null : r.getContent();
        if (page == null) throw new IOException("no response from url " + location.toString());

        try {
            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
        } catch (final Parser.Failure e) {
            throw new IOException(e.getMessage());
        }
    }
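A hypothetical call site for parseResource; the getAnchors() accessor on ContentScraper is an assumption, not shown on this page:

final ContentScraper scraper = loader.parseResource(new DigestURI("http://example.org/"), CacheStrategy.IFEXIST);
System.out.println(scraper.getAnchors()); // assumption: anchor accessor on ContentScraper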

     * @param cacheStrategy the cache strategy
     * @return a map from URLs to the anchor texts of the urls
     * @throws IOException
     */
    public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
        if (response.getContent() == null) throw new IOException("resource == null");
        if (responseHeader == null) throw new IOException("responseHeader == null");

        Document[] documents = null;
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
            if (documents == null) throw new IOException("documents == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
        }

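A hypothetical call site for loadLinks, printing each outgoing URL together with its anchor text:

final Map<MultiProtocolURI, String> links = loader.loadLinks(new DigestURI("http://example.org/"), CacheStrategy.NOCACHE);
for (final Map.Entry<MultiProtocolURI, String> link : links.entrySet()) {
    System.out.println(link.getKey() + " -> " + link.getValue());
}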

        public void run() {
            if (this.cache != null && this.cache.exists()) return;
            try {
                // load from the net
                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, true);
                final byte[] b = response.getContent();
                if (this.cache != null) FileUtils.copy(b, this.cache);
            } catch (final MalformedURLException e) {
                // ignored: this background download is best-effort
            } catch (final IOException e) {
                // ignored: this background download is best-effort
            }
        }
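This run() method belongs to a fire-and-forget cache primer: it downloads a resource, mirrors it into a local file, and deliberately swallows failures. A hypothetical way to schedule it (the Loader constructor shown is an assumption; the page does not show the enclosing class):

// assumption: the enclosing Runnable is called Loader and has this constructor
new Thread(new Loader(url, cacheFile, maxFileSize, CacheStrategy.IFEXIST)).start();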

                    if (profile == null) {
                        this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                        return true;
                    }
                    try {
                        this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(PROCESS, new Response(urlEntry, profile), null, null));
                        Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    }
                    return true;

                    // load a resource and push queue entry to switchboard queue
                    // returns null if everything went fine, a fail reason string if a problem occurred
                    try {
                        this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                        final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
                        final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), true);
                        if (response == null) {
                            this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                            if (CrawlQueues.this.log.isFine()) CrawlQueues.this.log.logFine("problem loading " + this.request.url().toString() + ": no content (possibly caused by cache policy)");
                            result = "no content (possibly caused by cache policy)";
                        } else {
