Package de.anomic.crawler

Examples of de.anomic.crawler.CrawlProfile
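
All snippets below share one pattern: a queued Request carries only the string handle of its crawl profile, and each loader resolves that handle against the table of active profiles before building a Response. A minimal sketch of that lookup, reusing the names that appear in the snippets (sb is the switchboard object seen throughout; the contract is inferred from the examples, not from a verified API reference):

    // Sketch only: handle-to-profile resolution as used in the loaders below.
    // `sb` and `request` are assumed to exist as in the surrounding snippets.
    final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
    if (profile == null) {
        // the profile may have been terminated while the URL was still queued
        throw new IOException("no active crawl profile for handle " + request.profileHandle());
    }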


                  sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);                   
                  throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
                }

                // create a new cache entry
                final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
                response = new Response(
                        request,
                        requestHeader,
                        header,
                        Integer.toString(code),
        // … (excerpt truncated)


            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = MimeTable.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    url.toTokens().getBytes());
            return response;
        }
       
        // load the resource
        InputStream is = url.getInputStream(null, -1);
        byte[] b = FileUtils.read(is);
        is.close();
       
        // create response with loaded content
        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
        // … (excerpt truncated)
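
The File loader above (and the nearly identical SMB loader below) gates full loading on two conditions: no parser is available for the MIME type while acceptOnlyParseable is set, or the resource exceeds maxFileSize. Factored into a standalone predicate for clarity (a sketch; the convention that TextParser.supports returns an error string when no parser is available and null otherwise is inferred from the snippet, not verified):

    // Sketch: the metadata-only decision used by the File and SMB loaders.
    static boolean metadataOnly(final String parserError, final long size,
                                final long maxFileSize, final boolean acceptOnlyParseable) {
        return (acceptOnlyParseable && parserError != null)   // nothing can parse this type
            || (maxFileSize >= 0 && size > maxFileSize);      // resource too large
    }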

            StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true);
           
            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    content.toString().getBytes());
           
            return response;
        }
       
        // create response header
        String mime = MimeTable.ext2mime(url.getFileExtension());
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
       
        // check mime type and availability of parsers
        // and also check resource size and limitation of the size
        long size;
        try {
            size = url.length();
        } catch (Exception e) {
            size = -1;
        }
        String parserError = null;
        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
            (size > maxFileSize && maxFileSize >= 0)) {
            // we know that we cannot process that file before loading
            // only the metadata is returned
           
            if (parserError != null) {
                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
            } else {
                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    url.toTokens().getBytes());
            return response;
        }
       
        // load the resource
        InputStream is = url.getInputStream(null, -1);
        byte[] b = FileUtils.read(is);
        is.close();
       
        // create response with loaded content
        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
        // … (excerpt truncated)

                    response = null;
                } else {
                    ResponseHeader responseHeader = new ResponseHeader();
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
                    final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
                    response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
                            "200",
        // … (excerpt truncated)

                log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
            }
           
            // create response with metadata only
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "200",
                    profile,
                    null);
            return response;
        }
       
        // download the remote file
        byte[] b = ftpClient.get(path);
       
        // create a response
        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "200",
        // … (excerpt truncated)

           
            Request urle;
            boolean dark = true;
            yacySeed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
                    prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
                    prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
                    prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
        // … (excerpt truncated)

                  final DigestURI startURL,
                  final String urlMustMatch,
                  final String urlMustNotMatch,
                  final int depth,
                  final boolean crawlingQ, final boolean medialink) {
        final CrawlProfile pe = new CrawlProfile(
                (startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null,
                urlMustMatch,
                urlMustNotMatch,
                CrawlProfile.MATCH_ALL_STRING,
                CrawlProfile.MATCH_NEVER_STRING,
                "", depth, medialink,
                CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
                true, true, true, false, true, true, true,
                CacheStrategy.IFFRESH);
        sb.crawler.putActive(pe.handle().getBytes(), pe);
        return sb.crawlStacker.stackCrawl(new Request(
                sb.peers.mySeed().hash.getBytes(),
                startURL,
                null,
                "CRAWLING-ROOT",
                new Date(),
                pe.handle(), 0, 0, 0, 0));
    }
        // … (excerpt truncated)
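
Note the ordering in this snippet: putActive registers the new profile under its handle before stackCrawl enqueues the root Request that references pe.handle(). A request stacked before registration would later resolve to a null profile in the loaders. A compressed sketch of that contract (same names as above):

    // Sketch: register first, then stack requests that carry the handle.
    sb.crawler.putActive(pe.handle().getBytes(), pe);
    final CrawlProfile resolved = sb.crawler.getActive(pe.handle().getBytes());
    assert resolved != null; // loaders resolve the handle exactly this way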

        // read post for handle
        final String handle = (post == null) ? "" : post.get("handle", "");
        if (post != null) {
            if (post.containsKey("terminate")) try {
                // termination of a crawl: shift the crawl from active to passive
                final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
                if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
                // remove all queued URLs that belong to this profile
                sb.crawler.removeActive(handle.getBytes());
                sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
            } catch (final RowSpaceExceededException e) {
                Log.logException(e);
            }
            if (post.containsKey("delete")) {
                // deletion of a terminated crawl profile
                sb.crawler.removePassive(handle.getBytes());
            }
            if (post.containsKey("deleteTerminatedProfiles")) {
                for (final byte[] h: sb.crawler.getPassive()) {
                    sb.crawler.removePassive(h);
                }
            }
        }

        // generate handle list, sorted by profile name
        CrawlProfile selentry;
        final Map<String, String> orderedHandles = new TreeMap<String, String>();
        for (final byte[] h : sb.crawler.getActive()) {
            selentry = sb.crawler.getActive(h);
            if (selentry != null && !ignoreNames.contains(selentry.name())) {
                orderedHandles.put(selentry.name(), selentry.handle());
            }
        }

        // then write into pop-up menu list
        int count = 0;
        for (final Map.Entry<String, String> nameHandle : orderedHandles.entrySet()) {
            prop.put("profiles_" + count + "_name", nameHandle.getKey());
            prop.put("profiles_" + count + "_handle", nameHandle.getValue());
            if (handle.equals(nameHandle.getValue())) {
                prop.put("profiles_" + count + "_selected", "1");
            }
            count++;
        }
        prop.put("profiles", count);
        selentry = sb.crawler.getActive(handle.getBytes());
        assert selentry == null || selentry.handle() != null;
        // read post for change submit
        if ((post != null) && (selentry != null)) {
            if (post.containsKey("submit")) {
                try {
                    // validate both regex filters before applying any field changes
                    Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL_STRING));
                    Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER_STRING));
                    final Iterator<eentry> lit = labels.iterator();
                    eentry tee;
                    while (lit.hasNext()) {
                        tee = lit.next();
                        final String cval = selentry.get(tee.name);
                        final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
                        if (!cval.equals(val)) {
                            selentry.put(tee.name, val);
                            sb.crawler.putActive(selentry.handle().getBytes(), selentry);
                        }
                    }
                } catch (final Exception ex) {
                    Log.logException(ex);
                    prop.put("error", "1");
                    prop.putHTML("error_message", ex.getMessage());
                }
            }
        }

        // generate crawl profile table
        count = 0;
        boolean dark = true;
        final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
        CrawlProfile profile;
        // put active crawls into list
        for (final byte[] h: sb.crawler.getActive()) {
            profile = sb.crawler.getActive(h);
            putProfileEntry(prop, sb.crawlStacker, profile, true, dark, count, domlistlength);
            dark = !dark;
        // … (excerpt truncated)
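
The edit loop above treats a CrawlProfile as a String-to-String map (get and put by field name) and re-puts the whole profile under its handle after a change. Assuming that map-like contract, a single-field update would look like this (the filter value is a made-up example):

    // Sketch: update one profile field and persist it (names from the snippet above).
    final String field = CrawlProfile.FILTER_URL_MUSTMATCH;   // field key used above
    final String oldVal = selentry.get(field);
    final String newVal = ".*\\.example\\.org/.*";            // hypothetical new filter
    if (!newVal.equals(oldVal)) {
        Pattern.compile(newVal);                              // validate, as the servlet does
        selentry.put(field, newVal);
        sb.crawler.putActive(selentry.handle().getBytes(), selentry);
    }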

           
            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i, showNum = 0;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
        // … (excerpt truncated)

                        final Pattern compiledPattern = Pattern.compile(pattern);
                       
                        if (option == PROFILE) {
                            // search and delete the crawl profile (_much_ faster, independent of queue size)
                            // XXX: what to do about the annoying LOST PROFILE messages in the log?
                            CrawlProfile entry;
                            for (final byte[] handle: sb.crawler.getActive()) {
                                entry = sb.crawler.getActive(handle);
                                final String name = entry.name();
                                if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
                                        name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
                                    continue;
                                if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
                            }
                        } else {
                            // iterating through the list of URLs
                            final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
                            Request entry;
                            final List<byte[]> removehashes = new ArrayList<byte[]>();
                            while (iter.hasNext()) {
                                if ((entry = iter.next()) == null) continue;
                                String value = null;
                               
                                location: switch (option) {
                                    case URL:       value = (entry.url() == null) ? null : entry.url().toString(); break location;
                                    case ANCHOR:    value = entry.name(); break location;
                                    case DEPTH:     value = Integer.toString(entry.depth()); break location;
                                    case INITIATOR:
                                        value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
                                        break location;
                                    case MODIFIED:  value = daydate(entry.appdate()); break location;
                                    default: value = null; break location;
                                }
                               
                                if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
                            }
                            Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
                            for (final byte[] b: removehashes) {
                                sb.crawlQueues.noticeURL.removeByURLHash(b);
                            }
                        }
                    } catch (final PatternSyntaxException e) {
                        Log.logException(e);
                    }
                }
               
                prop.put("info", "3");//crawling queue cleared
                prop.putNum("info_numEntries", c);
            } else if (post.containsKey("deleteEntry")) {
                final String urlHash = post.get("deleteEntry");
                sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
                prop.put("LOCATION","");
                return prop;
            }
        }

        int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
        if (stackSize == 0) {
            prop.put("crawler-queue", "0");
        } else {
            prop.put("crawler-queue", "1");
            final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));

            Request urle;
            boolean dark = true;
            Seed initiator;
            String profileHandle;
            CrawlProfile profileEntry;
            int i;
            for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
                urle = crawlerList.get(i);
                if (urle != null && urle.url() != null) {
                    initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
                    profileHandle = urle.profileHandle();
                    profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
                    prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
                    prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
                    prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
                    prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
                    prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
                    prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
                    prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
                    prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
        // … (excerpt truncated)
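
One detail worth noting in the snippet above: profile names are tested with compiledPattern.matcher(name).find(), a substring match, while queue-entry values are tested with matches(), which must cover the whole string. The difference in plain java.util.regex, self-contained and runnable:

    import java.util.regex.Pattern;

    public class FindVsMatches {
        public static void main(String[] args) {
            final Pattern p = Pattern.compile("crawl");
            System.out.println(p.matcher("my-crawl-profile").find());    // true:  pattern occurs somewhere
            System.out.println(p.matcher("my-crawl-profile").matches()); // false: whole string must match
        }
    }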
