Examples of ResponseHeader


Examples of net.yacy.cora.protocol.ResponseHeader

        String             seedListFileURL;
        DigestURI          url;
        Iterator<String>   enu;
        int                lc;
        final int          sc = this.peers.sizeConnected();
        ResponseHeader header;

        final RequestHeader reqHeader = new RequestHeader();
        reqHeader.put(HeaderFramework.PRAGMA, "no-cache");
        reqHeader.put(HeaderFramework.CACHE_CONTROL, "no-cache");
        reqHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
        final HTTPClient client = new HTTPClient();
        client.setHeader(reqHeader.entrySet());
        client.setTimout((int) getConfigLong("bootstrapLoadTimeout", 20000));

        yacyCore.log.logInfo("BOOTSTRAP: " + sc + " seeds known from previous run");

        // - use the superseed to further fill up the seedDB
        int ssc = 0, c = 0;
        while (true) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            seedListFileURL = sb.getConfig("network.unit.bootstrap.seedlist" + c, "");
            if (seedListFileURL.length() == 0) {
                break;
            }
            c++;
            if (
                    seedListFileURL.startsWith("http://") ||
                    seedListFileURL.startsWith("https://")
            ) {
                // load the seed list
                try {

                    url = new DigestURI(seedListFileURL);
                    //final long start = System.currentTimeMillis();
                    client.HEADResponse(url.toString());
                    header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
                    //final long loadtime = System.currentTimeMillis() - start;
                    /*if (header == null) {
                        if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) {
                            yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, time-out after " + loadtime + " milliseconds");
                        } else {
                            yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not available, no content");
                        }
                    } else*/ if (header.lastModified() == null) {
                        yacyCore.log.logWarning("BOOTSTRAP: seed-list URL " + seedListFileURL + " not usable, last-modified is missing");
                    } else if ((header.age() > 86400000) && (ssc > 0)) {
                        yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
                    } else {
                        ssc++;
                        final byte[] content = client.GETbytes(url);
                        enu = FileUtils.strings(content);
                        lc = 0;
                        while (enu.hasNext()) {
                            try {
                                ys = yacySeed.genRemoteSeed(enu.next(), null, false, null);
                                if ((ys != null) &&
                                    (!this.peers.mySeedIsDefined() || !this.peers.mySeed().hash.equals(ys.hash))) {
                                        final long lastseen = Math.abs((System.currentTimeMillis() - ys.getLastSeenUTC()) / 1000 / 60);
                                        if (lastseen < 240) {
                                            if (this.peers.peerActions.connectPeer(ys, false)) {
                                                lc++;
                                            }
                                        }
                                    }
                            } catch (final IOException e) {
                                yacyCore.log.logInfo("BOOTSTRAP: bad seed: " + e.getMessage());
                            }
                        }
                        yacyCore.log.logInfo("BOOTSTRAP: " + lc + " seeds from seed-list URL " + seedListFileURL + ", AGE=" + (header.age() / 3600000) + "h");
                    }

                } catch (final IOException e) {
                    // this is when wget fails, commonly because of timeout
                    yacyCore.log.logWarning("BOOTSTRAP: failed (1) to load seeds from seed-list URL " + seedListFileURL + ": " + e.getMessage());
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

        // iterate over all images in the result
        final List<MediaSnippet> imagemedia = result.mediaSnippets();
        if (imagemedia != null) {
            feedloop: for (final MediaSnippet ms: imagemedia) {
                // check cache to see if the mime type of the image url is correct
                final ResponseHeader header = Cache.getResponseHeader(ms.href.hash());
                if (header != null) {
                    // this does not work for all urls since some of them may not be in the cache
                    if (header.mime().startsWith("text") || header.mime().startsWith("application")) continue feedloop;
                }
                this.images.put(new ReverseElement<MediaSnippet>(ms, ms.ranking)); // remove smallest in case of overflow
                c++;
                //System.out.println("*** image " + UTF8.String(ms.href.hash()) + " images.size = " + images.size() + "/" + images.size());
            }
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

        // loading data from database
        Map<String, String> hdb;
        hdb = responseHeaderDB.get(hash);
        if (hdb == null) return null;
       
        return new ResponseHeader(null, hdb);
    }
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

            // statistics:
            if (robotsTxt != null) {
              ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);
            }
            final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
           
            // check the response status
            if (code > 199 && code < 300) {
              if (!header.mime().startsWith("text/plain")) {
                    robotsTxt = null;
                    log.info("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
                } else {

                    // getting some metadata
                  eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
                    lastMod = header.lastModified();
                   
                    // if the robots.txt file was not changed we break here
                    if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
                        if (log.isDebugEnabled()) log.debug("Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                        return null;
                    }
                   
                   
                    downloadEnd = System.currentTimeMillis();                   
                    if (log.isDebugEnabled()) log.debug("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
                }
            } else if (code == 304) {
                return null;
            } else if (code > 299 && code < 400) {
                // getting redirection URL
              String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                if (redirectionUrlString==null) {
                    if (log.isDebugEnabled())
                    log.debug("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
                    robotsTxt = null;                   
                } else {
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

        final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
        if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry

            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
            if (cachedResponse != null && content != null) {
                // yes we have the content

                // create request header values and a response object because we need that
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

     * @throws IOException
     */
    public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
        byte[] resource = response.getContent();
        if (resource == null) throw new IOException("resource == null");
        if (responseHeader == null) throw new IOException("responseHeader == null");

        Document[] documents = null;
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource));
            if (documents == null) throw new IOException("document == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
        } finally {
            resource = null;
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

   
    public Response(final Request request, final CrawlProfile profile) {
        this.request = request;
        // request and response headers may be zero in case that we process surrogates
        this.requestHeader = new RequestHeader();
        this.responseHeader = new ResponseHeader();
        if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
        this.responseStatus = "200";
        this.profile = profile;
        this.status = QUEUE_STATE_FRESH;
        this.content = request.url().toTokens().getBytes();
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
        client.setTimout(socketTimeout);
        client.setHeader(requestHeader.entrySet());
            // send request
          final byte[] responseBody = client.GETbytes(url, maxFileSize);
          final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
          final int code = client.getHttpResponse().getStatusLine().getStatusCode();

          if (code > 299 && code < 310) {
            // redirection (content may be empty)
                if (header.containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                  String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

        final HTTPClient client = new HTTPClient();
        client.setTimout(20000);
        client.setHeader(requestHeader.entrySet());
          final byte[] responseBody = client.GETbytes(request.url(), Long.MAX_VALUE);
          final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
          final int code = client.getHttpResponse().getStatusLine().getStatusCode();
            // FIXME: 30*-handling (bottom) is never reached
            // we always get the final content because httpClient.followRedirects = true

          if (responseBody != null && (code == 200 || code == 203)) {
                // the transfer is ok
           
            //statistics:
            ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
               
                // we write the new cache entry to file system directly

                // create a new cache entry
                response = new Response(
                        request,
                        requestHeader,
                        header,
                        Integer.toString(code),
                        null,
                        responseBody
                );

                return response;
            } else if (code > 299 && code < 310) {
                if (header.containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                  String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                    }
View Full Code Here

Examples of net.yacy.cora.protocol.ResponseHeader

                list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
            }
        
            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = MimeTable.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.