Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Response


    }

    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final long maxFileSize) throws IOException, Parser.Failure {

        // load resource
        final Response response = load(request, cacheStrategy, maxFileSize, false);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);

        // if it is still not available, report an error
        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

        // parse resource
        return response.parse();
    }
View Full Code Here


    }

    public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
        // load page
        final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
        final Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
        final byte[] page = (r == null) ? null : r.getContent();
        if (page == null) throw new IOException("no response from url " + location.toString());

        try {
          return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
        } catch(final Parser.Failure e) {
          throw new IOException(e.getMessage());
        }
    }
View Full Code Here

     * @param cacheStrategy the cache strategy
     * @return a map from URLs to the anchor texts of the urls
     * @throws IOException
     */
    public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
        byte[] resource = response.getContent();
        if (resource == null) throw new IOException("resource == null");
        if (responseHeader == null) throw new IOException("responseHeader == null");

        Document[] documents = null;
        final String supportError = TextParser.supports(url, responseHeader.mime());
View Full Code Here

        public void run() {
            if (this.cache != null && this.cache.exists()) return;
            try {
                // load from the net
                final Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
                final byte[] b = response.getContent();
                if (this.cache != null) FileUtils.copy(b, this.cache);
            } catch (final MalformedURLException e) {} catch (final IOException e) {}
        }
View Full Code Here

        // if we have an url then try to load the rss
        RSSReader rss = null;
        if (url != null) try {
            prop.put("url", url.toNormalform(true, false));
            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
            final byte[] resource = response == null ? null : response.getContent();
            rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
        } catch (final IOException e) {
            Log.logException(e);
        }
View Full Code Here

        }

        // loading the resource content as byte array
        prop.put("error_incache", Cache.has(url) ? 1 : 0);

        Response response = null;
        try {
            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, true);
        } catch (final IOException e) {
            prop.put("error", "4");
            prop.put("error_errorText", "error loading resource: " + e.getMessage());
            prop.put("viewMode", VIEW_MODE_NO_TEXT);
            return prop;
        }

        if (response == null) {
            prop.put("error", "4");
            prop.put("error_errorText", "No resource available");
            prop.put("viewMode", VIEW_MODE_NO_TEXT);
            return prop;
        }

        final String[] wordArray = wordArray(post.get("words", null));

        if (viewMode.equals("plain")) {

            // TODO: how to handle very large files here ?
            String content;
            try {
                content = UTF8.String(response.getContent());
            } catch (final Exception e) {
                prop.put("error", "4");
                prop.putHTML("error_errorText", e.getMessage());
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                return prop;
            }

            prop.put("error", "0");
            prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
            prop.put("viewMode_plainText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"));

        } else if (viewMode.equals("iframeWeb")) {
            prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_WEB);
            prop.put("viewMode_url", url.toNormalform(false, true));

        } else if (viewMode.equals("iframeCache")) {
            prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
            final String ext = url.getFileExtension();
            if ("jpg.jpeg.png.gif".indexOf(ext) >= 0) {
                prop.put("viewMode_png", 1);
                prop.put("viewMode_png_url", url.toNormalform(false, true));
            } else {
                prop.put("viewMode_html", 1);
                prop.put("viewMode_html_url", url.toNormalform(false, true));
            }
        } else if (viewMode.equals("parsed") || viewMode.equals("sentences"|| viewMode.equals("words") || viewMode.equals("links")) {
            // parsing the resource content
            Document document = null;
            try {
                document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
                if (document == null) {
                    prop.put("error", "5");
                    prop.put("error_errorText", "Unknown error");
                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
                    return prop;
                }
            } catch (final Parser.Failure e) {
                prop.put("error", "5");
                prop.putHTML("error_errorText", e.getMessage());
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                return prop;
            }

            if (viewMode.equals("parsed")) {
                final String content = UTF8.String(document.getTextBytes());
                // content = wikiCode.replaceHTML(content); // added by Marc Nause
                prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
                prop.put("viewMode_title", document.dc_title());
                prop.put("viewMode_creator", document.dc_creator());
                prop.put("viewMode_subject", document.dc_subject(','));
                prop.put("viewMode_description", document.dc_description());
                prop.put("viewMode_publisher", document.dc_publisher());
                prop.put("viewMode_format", document.dc_format());
                prop.put("viewMode_identifier", document.dc_identifier());
                prop.put("viewMode_source", url.toString());
                prop.put("viewMode_lat", document.lat());
                prop.put("viewMode_lon", document.lon());
                prop.put("viewMode_parsedText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"));

            } else if (viewMode.equals("sentences")) {
                prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
                final Collection<StringBuilder> sentences = document.getSentences(pre);

                boolean dark = true;
                int i = 0;
                String sentence;
                if (sentences != null) {

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        if (sentence.trim().length() > 0) {
                            prop.put("viewMode_sentences_" + i + "_nr", i + 1);
                            prop.put("viewMode_sentences_" + i + "_text", markup(wordArray, sentence));
                            prop.put("viewMode_sentences_" + i + "_dark", dark ? "1" : "0");
                            dark = !dark;
                            i++;
                        }
                    }
                }
                prop.put("viewMode_sentences", i);

            } else if (viewMode.equals("words")) {
                prop.put("viewMode", VIEW_MODE_AS_PARSED_WORDS);
                final Collection<StringBuilder> sentences = document.getSentences(pre);

                boolean dark = true;
                int i = 0;
                String sentence;
                StringBuilder token;
                if (sentences != null) {

                    // Search word highlighting
                    for (final StringBuilder s: sentences) {
                        sentence = s.toString();
                        Enumeration<StringBuilder> tokens = null;
                        tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
                                prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
                                dark = !dark;
                                i++;
                            }
                        }
                    }
                }
                prop.put("viewMode_words", i);

            } else if (viewMode.equals("links")) {
                prop.put("viewMode", VIEW_MODE_AS_LINKLIST);
                boolean dark = true;
                int i = 0;
                i += putMediaInfo(prop, wordArray, i, document.getVideolinks(), "video", (i % 2 == 0), document.getAnchors());
                i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0), document.getAnchors());
                dark = (i % 2 == 0);

                final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
                final Iterator<ImageEntry> tsi = ts.values().iterator();
                ImageEntry entry;
                while (tsi.hasNext()) {
                    entry = tsi.next();
                    prop.put("viewMode_links_" + i + "_nr", i);
                    prop.put("viewMode_links_" + i + "_dark", dark ? "1" : "0");
                    prop.put("viewMode_links_" + i + "_type", "image");
                    prop.put("viewMode_links_" + i + "_text", (entry.alt().isEmpty()) ? "&nbsp;" : markup(wordArray, entry.alt()));
                    prop.put("viewMode_links_" + i + "_url", entry.url().toNormalform(false, true));
                    prop.put("viewMode_links_" + i + "_link", markup(wordArray, entry.url().toNormalform(false, true)));
                    if (entry.width() > 0 && entry.height() > 0) {
                        prop.put("viewMode_links_" + i + "_rel", entry.width() + "x" + entry.height() + " Pixel");
                    } else {
                        prop.put("viewMode_links_" + i + "_rel", "");
                    }
                    prop.put("viewMode_links_" + i + "_name", "");
                    dark = !dark;
                    i++;
                }
                i += putMediaInfo(prop, wordArray, i, document.getApplinks(), "app", (i % 2 == 0), document.getAnchors());
                i += putMediaInfo(prop, wordArray, i, document.getHyperlinks(), "link", (i % 2 == 0), document.getAnchors());
                prop.put("viewMode_links", i);

            }
            if (document != null) document.close();
        }
        prop.put("error", "0");
        prop.put("error_url", url.toNormalform(false, true));
        prop.put("error_hash", urlHash);
        prop.put("error_wordCount", wordCount);
        prop.putHTML("error_desc", (descr.isEmpty()) ? "&nbsp;" : descr);
        prop.putNum("error_size", size);
        prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1");
        prop.put("error_mimeTypeAvailable_mimeType", response.getMimeType());
        return prop;
    }
View Full Code Here

                        sb.crawler.defaultProxyProfile.handle(),
                        0,
                        0,
                        0,
                        0);
                final Response response = new Response(
                    request,
                        requestHeader,
                        cachedResponseHeader,
                        "200 OK",
                        sb.crawler.defaultProxyProfile
                );
                final byte[] cacheContent = Cache.getContent(url.hash());
                if (cacheContent != null && response.isFreshForProxy()) {
                    if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
                    fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
                } else {
                    if (log.isFinest()) log.logFinest(reqID + " fulfill request from web");
                    fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond);
View Full Code Here

                        responseHeader);

                if (hasBody(client.getHttpResponse().getStatusLine().getStatusCode())) {

                    final OutputStream outStream = chunkedOut != null ? chunkedOut : respond;
                    final Response response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
                            Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
                            sb.crawler.defaultProxyProfile
                    );
                    final String storeError = response.shallStoreCacheForProxy();
                    final boolean storeHTCache = response.profile().storeHTCache();
                    final String supportError = TextParser.supports(response.url(), response.getMimeType());
                    if (
                            /*
                             * Now we store the response into the htcache directory if
                             * a) the response is cacheable AND
                             */
                            (storeError == null) &&
                            /*
                             * b) the user has configured to use the htcache OR
                             * c) the content should be indexed
                             */
                            ((storeHTCache) || (supportError != null))
                    ) {
                        // we don't write actually into a file, only to RAM, and schedule writing the file.
//                        int l = res.getResponseHeader().size();
                      final int l = responseHeader.size();
                        final ByteArrayOutputStream byteStream = new ByteArrayOutputStream((l < 32) ? 32 : l);

                        final OutputStream toClientAndMemory = new MultiOutputStream(new OutputStream[] {outStream, byteStream});
//                        FileUtils.copy(res.getDataAsStream(), toClientAndMemory);
                        client.writeTo(toClientAndMemory);
                        // cached bytes
                        byte[] cacheArray;
                        if (byteStream.size() > 0) {
                            cacheArray = byteStream.toByteArray();
                        } else {
                            cacheArray = null;
                        }
                        if (log.isFine()) log.logFine(reqID +" writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));

                        if (sizeBeforeDelete == -1) {
                            // totally fresh file
                            response.setContent(cacheArray);
                            try {
                                Cache.store(response.url(), response.getResponseHeader(), cacheArray);
                                sb.toIndexer(response);
                            } catch (final IOException e) {
                                log.logWarning("cannot write " + response.url() + " to Cache (1): " + e.getMessage(), e);
                            }
                            conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_MISS");
                        } else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) {
                            // before we came here we deleted a cache entry
                            cacheArray = null;
                            //cacheManager.push(cacheEntry); // unnecessary update
                            conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REF_FAIL_HIT");
                        } else {
                            // before we came here we deleted a cache entry
                            response.setContent(cacheArray);
                            try {
                                Cache.store(response.url(), response.getResponseHeader(), cacheArray);
                                sb.toIndexer(response);
                            } catch (final IOException e) {
                                log.logWarning("cannot write " + response.url() + " to Cache (2): " + e.getMessage(), e);
                            }
                            conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
                        }
                    } else {
                        // no caching
View Full Code Here

    public OAIPMHLoader(final LoaderDispatcher loader, final DigestURI source, final File targetDir, final String filePrefix) throws IOException {
        this.source = source;

        // load the file from the net
        final Response response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
        final byte[] b = response.getContent();
        this.resumptionToken = new ResumptionToken(source, b);
        //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
        final File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
        final File f0 = new File(targetDir, f1.getName() + ".tmp");

View Full Code Here

    this.indexSegment = null;
  }

  public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
    if(this.document == null) {
      Response response = null;
      response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
      this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
    }
    return this.document;
 
View Full Code Here

TOP

Related Classes of de.anomic.crawler.retrieval.Response

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.