Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Response

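The snippets below show how Response objects are created and consumed by the crawler. As a quick orientation, here is a minimal sketch (not taken from the source) of the constructor and accessors that appear in the examples; the variables request, requestHeader, responseHeader, profile and content are assumed to exist, and exception handling is omitted:

    // sketch only: constructor and accessors as used in the snippets below
    final Response response = new Response(
            request,          // the crawler Request this document belongs to
            requestHeader,    // may be null, as in the surrogate example
            responseHeader,   // may be null, as in the surrogate example
            "200",            // response status string used by the snippets below
            profile,          // CrawlProfile that controls caching and indexing
            content);         // raw document bytes

    final byte[] body = response.getContent();                  // raw bytes, possibly null
    final ResponseHeader header = response.getResponseHeader(); // HTTP headers, possibly null
    final Document[] documents = response.parse();              // parsed documents, see the examples below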

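Constructing a Response directly (status "200", no request or response headers) for imported surrogate documents and queueing them for indexing: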
    public void processSurrogate(final InputStream is, final String name) throws IOException {
        final SurrogateReader reader = new SurrogateReader(is, 100);
        final Thread readerThread = new Thread(reader, name);
        readerThread.start();
        DCEntry surrogate;
        Response response;
        while ((surrogate = reader.take()) != DCEntry.poison) {
            // check if url is in accepted domain
            assert surrogate != null;
            assert this.crawlStacker != null;
            final String urlRejectReason = this.crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true));
            if (urlRejectReason != null) {
                this.log.logWarning("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
                continue;
            }

            // create a queue entry
            final Document document = surrogate.document();
            final Request request = new Request(
                    ASCII.getBytes(this.peers.mySeed().hash),
                    surrogate.getIdentifier(true),
                    null,
                    "",
                    surrogate.getDate(),
                    this.crawler.defaultSurrogateProfile.handle(),
                    0,
                    0,
                    0,
                    0
            );
            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
            final indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[]{document}, null);

            // place the queue entry into the concurrent process of the condenser (document analysis)
            try {
                this.indexingCondensementProcessor.enQueue(queueEntry);
            } catch (final InterruptedException e) {
                Log.logException(e);
            }
            // ...


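Loading a Response in a background thread, checking that content and headers are present, and parsing it into documents: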
        // ...
        new Thread() {
            @Override
            public void run() {
                try {
                    final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE, true);
                    if (response == null) {
                        throw new IOException("response == null");
                    }
                    if (response.getContent() == null) {
                        throw new IOException("content == null");
                    }
                    if (response.getResponseHeader() == null) {
                        throw new IOException("header == null");
                    }
                    final Document[] documents = response.parse();
                    if (documents != null) {
                        for (final Document document: documents) {
                            if (document.indexingDenied()) {
                                throw new Parser.Failure("indexing is denied", url);
                            }
                            // ...

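Feeding the content of a Response into an RSS parser: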
                // ...

                // if we have a URL then try to load the RSS
                RSSReader rss = null;
                try {
                    final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
                    final byte[] resource = (response == null) ? null : response.getContent();
                    //System.out.println("BLEKKO: " + UTF8.String(resource));
                    rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
                } catch (final IOException e) {
                    Log.logException(e);
                }
                // ...

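Copying the content of a Response into a local file if that file does not exist yet: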
    public static Map<String, String> getListFriends(final LoaderDispatcher loader) {
        final Map<String, String> map = new TreeMap<String, String>();
        Map<String, String> m;
        for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
            if (!oaiFriend.getValue().exists()) {
                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
                if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
            }

            if (oaiFriend.getValue().exists()) {
                final byte[] b = FileUtils.read(oaiFriend.getValue());
                if (b != null) {
                    // ...

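Creating a Response from a crawl queue entry and its profile and placing it on the indexing queue: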
                    if (profile == null) {
                        this.log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
                        return true;
                    }
                    try {
                        this.sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, profile), null, null));
                        Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    }
                    return true;
                // ...

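Loading a Response inside a crawl worker, using the profile's cache strategy and a configured maximum file size: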
                    // returns null if everything went fine, a fail reason string if a problem occurred
                    try {
                        this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                        final long maxFileSize = CrawlQueues.this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
                        final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
                        final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true);
                        if (response == null) {
                            this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                            if (CrawlQueues.this.log.isFine()) CrawlQueues.this.log.logFine("problem loading " + this.request.url().toString() + ": no content (possibly caused by cache policy)");
                            result = "no content (possibly caused by cache policy)";
                        } else {
                            // ...


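Loading an RSS feed; a missing Response or missing content is treated as a failed load: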
    public void run() {
        RSSReader rss = null;
        try {
            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
            final byte[] resource = response == null ? null : response.getContent();
            rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
        } catch (final MalformedURLException e) {
            Log.logWarning("Load_RSS", "rss loading for url '" + getName().substring(9) + "' failed: " + e.getMessage());
            return;
        } catch (final IOException e) {
            // ...

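Guarding the internal load call with a per-URL semaphore that is released once the load finishes: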
        // ...

        this.loaderSteering.put(url, new Semaphore(0));
        try {
            final Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
            check = this.loaderSteering.remove(url);
            if (check != null) check.release(1000);
            return response;
        } catch (final IOException e) {
            // release the semaphore anyway
            // ...

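Building a Response from the cache, testing its freshness, or loading it from the network and optionally storing the result back into the cache: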
                // ... (the cache held both a response header and content for this URL)
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);

                // check which caching strategy shall be used
                if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be IFFRESH, which means we should do a proxy freshness test
                assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
                    return response;
                } else {
                    this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
                }
            } else if (cachedResponse != null) {
                this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
            } else if (content != null) {
                this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
            }
        }

        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CacheStrategy.CACHEONLY) {
            // we had a chance to get the content from the cache ... it's over, we don't have it
            throw new IOException("cache only strategy");
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we checked possibly already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!url.isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                // force a sleep here; instead of just sleeping, we first use the time to clean up the accessTime map
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime)
                    try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
            }
        }

        // at this point it is certain that we will access the target, so remember the access time
        if (host != null) accessTime.put(host, System.currentTimeMillis());

        // load resource from the internet
        Response response = null;
        if (protocol.equals("http") || protocol.equals("https")) response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = this.ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = this.smbLoader.load(request, true);
        if (protocol.equals("file")) response = this.fileLoader.load(request, true);
        if (response != null && response.getContent() != null) {
            // we got something. Now check whether we want to store it in the cache.
            // The first check asks whether the crawl profile wants the content to be cached at all.
            if (!crawlProfile.storeHTCache()) {
                // no caching wanted. That's ok, we do not write any message
                return response;
            }
            // the second check tells us whether the protocol allows the content to be cached
            final String storeError = response.shallStoreCacheForCrawler();
            if (storeError == null) {
                try {
                    Cache.store(url, response.getResponseHeader(), response.getContent());
                } catch (final IOException e) {
                    this.log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
                }
            } else {
                this.log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
            }
            return response;
        }

        throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
    }

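A convenience wrapper that loads a resource and returns only the Response body: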
    /**
     * Try to download a resource using the loader and return its content.
     *
     * @return the resource body, or null if the resource could not be found
     * @throws IOException
     */
    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
        // try to download the resource using the loader
        final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
        final Response entry = load(request, cacheStrategy, maxFileSize, false);
        if (entry == null) return null; // not found on the web

        // read resource body (if it is there)
        return entry.getContent();
    }
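
A hedged usage sketch for the method above; it assumes that loadContent is exposed on the same loader object used in the other snippets and reuses the request factory call seen in the RSS examples:

    // sketch only: fetch the raw bytes of a URL, bypassing the cache
    try {
        final Request request = loader.request(url, true, false);               // request factory as used above
        final byte[] body = loader.loadContent(request, CacheStrategy.NOCACHE);
        if (body == null) {
            // the resource could not be found
        }
    } catch (final IOException e) {
        Log.logException(e);
    }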
