Package de.anomic.search

Examples of de.anomic.search.Segment$ReferenceCleaner


        prop.put("remoteCrawlState", "");
        prop.put("list-remote", 0);
        prop.put("forwardToCrawlStart", "0");

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        prop.put("info", "0");

        if (post != null && post.containsKey("continue")) {
            // continue queue
            final String queue = post.get("continue", "");
            if ("localcrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("pause")) {
            // pause queue
            final String queue = post.get("pause", "");
            if ("localcrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("crawlingstart")) {
            // init crawl
            if (sb.peers == null) {
                prop.put("info", "3");
            } else {
                String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
                // add the prefix http:// if necessary
                int pos = crawlingStart.indexOf("://");
                if (pos == -1) {
                    if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
                }

                // remove crawlingFileContent before we record the call
                final String crawlingFileName = post.get("crawlingFile");
                final File crawlingFile = (crawlingFileName != null && crawlingFileName.length() > 0) ? new File(crawlingFileName) : null;
                if (crawlingFile != null && crawlingFile.exists()) {
                    post.remove("crawlingFile$file");
                }

                // normalize URL
                DigestURI crawlingStartURL = null;
                if (crawlingFile == null) try {crawlingStartURL = new DigestURI(crawlingStart);} catch (final MalformedURLException e1) {Log.logException(e1);}
                crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);

                // set new properties
                final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
                final boolean subPath    = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start

                // set the crawl filter
                String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
                final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
                if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid filtering out every URL if a bad value was submitted
                // special cases:
                if (crawlingStartURL != null && fullDomain) {
                    if (crawlingStartURL.isFile()) {
                        newcrawlingMustMatch = "file://" + crawlingStartURL.getPath() + ".*";
                    } else if (crawlingStartURL.isSMB()) {
                        newcrawlingMustMatch = "smb://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
                    } else if (crawlingStartURL.isFTP()) {
                        newcrawlingMustMatch = "ftp://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
                    } else {
                        newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
                    }
                }
                if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
                    newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
                }

                final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                env.setConfig("crawlOrder", crawlOrder);

                int newcrawlingdepth = post.getInt("crawlingDepth", 8);
                env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

                // recrawl
                final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
                boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
                int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
                String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
                int repeat_time = post.getInt("repeat_time", -1);
                final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays

                if ("scheduler".equals(recrawl) && repeat_time > 0) {
                    // set crawlingIfOlder attributes that are appropriate for scheduled crawling
                    crawlingIfOlderCheck = true;
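                    // heuristic (inferred from the arithmetic below): re-fetch documents
                    // older than roughly half the scheduling period, expressed in hours
                    // (minutes -> 1 hour minimum, hours -> repeat/2, days -> repeat * 12)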
                    crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
                    crawlingIfOlderUnit = "hour";
                } else if ("reload".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = true;
                } else if ("nodoubles".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = false;
                }
                final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
                env.setConfig("crawlingIfOlder", crawlingIfOlder);

                // store this call as api call
                if (repeat_time > 0) {
                    // store as scheduled api call
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
                } else {
                    // store just a protocol
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
                }

                final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
                final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
                env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

                final boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
                env.setConfig("crawlingQ", crawlingQ);

                final boolean indexText = "on".equals(post.get("indexText", "on"));
                env.setConfig("indexText", indexText);

                final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
                env.setConfig("indexMedia", indexMedia);

                boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
                if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
                env.setConfig("storeHTCache", storeHTCache);

                CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
                if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;

                final boolean xsstopw = "on".equals(post.get("xsstopw", "off"));
                env.setConfig("xsstopw", xsstopw);

                final boolean xdstopw = "on".equals(post.get("xdstopw", "off"));
                env.setConfig("xdstopw", xdstopw);

                final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
                env.setConfig("xpstopw", xpstopw);

                final String crawlingMode = post.get("crawlingMode","url");
                if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
                    try {
                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);
                        final CrawlProfile profile = new CrawlProfile(
                                crawlingStart,
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final DigestURI url = crawlingStartURL;
                        sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if ("url".equals(crawlingMode)) {

                    // check if pattern matches
                    if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                        // print error message
                        prop.put("info", "4"); //crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_crawlingStart", crawlingStart);
                    } else try {

                        // check if the crawl filter works correctly
                        Pattern.compile(newcrawlingMustMatch);

                        // stack request
                        // first delete old entry, if exists
                        final DigestURI url = new DigestURI(crawlingStart);
                        final byte[] urlhash = url.hash();
                        indexSegment.urlMetadata().remove(urlhash);
                        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                        sb.crawlQueues.errorURL.remove(urlhash);

                        // stack url
                        sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
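                        // the profile name is the host if one is known, otherwise the normalized start URL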
                        final CrawlProfile pe = new CrawlProfile(
                                (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
                                crawlingStartURL,
                                newcrawlingMustMatch,
                                newcrawlingMustNotMatch,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText, indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                sb.peers.mySeed().hash.getBytes(),
                                url,
                                null,
                                "CRAWLING-ROOT",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0,
                                0
                                ));

                        if (reasonString == null) {
                            // create a bookmark from crawl start url
                            final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder", "/crawlStart")));
                            tags.add("crawlStart");
                            if ("on".equals(post.get("createBookmark","off"))) {
                                final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                                if (bookmark != null) {
                                    bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
                                    bookmark.setOwner("admin");
                                    bookmark.setPublic(false);
                                    bookmark.setTags(tags, true);
                                    sb.bookmarksDB.saveBookmark(bookmark);
                                }
                            }
                            // liftoff!
                            prop.put("info", "8");//start msg
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            prop.put("info", "5"); //Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", reasonString);

                            sb.crawlQueues.errorURL.push(
                                new Request(
                                        sb.peers.mySeed().hash.getBytes(),
                                        crawlingStartURL,
                                        null,
                                        "",
                                        new Date(),
                                        pe.handle(),
                                        0,
                                        0,
                                        0,
                                        0),
                                sb.peers.mySeed().hash.getBytes(),
                                new Date(),
                                1,
                                FailCategory.FINAL_LOAD_CONTEXT,
                                reasonString, -1);
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", crawlingStart);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }

                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile));
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                            final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
                            final CrawlProfile profile = new CrawlProfile(
                                    crawlingFileName,
                                    crawlURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER,
                                    newcrawlingdepth,
                                    crawlingIfOlder,
                                    crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText,
                                    indexMedia,
                                    storeHTCache,
                                    crawlOrder,
                                    xsstopw,
                                    xdstopw,
                                    xpstopw,
                                    cachePolicy);
                            sb.crawler.putActive(profile.handle().getBytes(), profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL","");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        final CrawlProfile pe = new CrawlProfile(
                                sitemapURLStr,
                                sitemapURL,
                                CrawlProfile.MATCH_ALL,
                                CrawlProfile.MATCH_NEVER,
                                0,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                true,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(pe.handle().getBytes(), pe);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
                        importer.start();
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("sitelist".equals(crawlingMode)) {
                    try {
                        final DigestURI sitelistURL = new DigestURI(crawlingStart);
                        // download document
                        final ContentScraper scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
                        // String title = scraper.getTitle();
                        // String description = scraper.getDescription();

                        // get links and generate filter
                        final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                        if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());

                        // put links onto crawl queue
                        final CrawlProfile profile = new CrawlProfile(
                                sitelistURL.getHost(),
                                sitelistURL,
                                newcrawlingMustMatch,
                                CrawlProfile.MATCH_NEVER,
                                newcrawlingdepth,
                                crawlingIfOlder,
                                crawlingDomMaxPages,
                                crawlingQ,
                                indexText,
                                indexMedia,
                                storeHTCache,
                                crawlOrder,
                                xsstopw,
                                xdstopw,
                                xpstopw,
                                cachePolicy);
                        sb.crawler.putActive(profile.handle().getBytes(), profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
                        DigestURI nexturl;
                        while (linkiterator.hasNext()) {
                            final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
                            if (e.getKey() == null) continue;
                            nexturl = new DigestURI(e.getKey());
                            // remove the url from the database to be prepared to crawl them again
                            final byte[] urlhash = nexturl.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);
                            sb.crawlStacker.enqueueEntry(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    nexturl,
                                    // ... (snippet truncated here)
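
The helper recrawlIfOlderC that computes the "crawlingIfOlder" value above is not part of this excerpt. A minimal sketch of what it plausibly does, inferred from how its result is consumed (a cut-off date in milliseconds; documents last fetched before it are reloaded) and from the year/month/day/hour units named in the comments. This is a reconstruction, not the original YaCy implementation:

    // hypothetical reconstruction of recrawlIfOlderC, not original YaCy code
    static long recrawlIfOlderC(final boolean check, final int number, final String unit) {
        if (!check || number < 0) return 0L; // recrawl disabled: cut-off date in the distant past
        final long factor; // milliseconds per unit
        if ("year".equals(unit)) factor = 365L * 24L * 60L * 60L * 1000L;
        else if ("month".equals(unit)) factor = 30L * 24L * 60L * 60L * 1000L;
        else if ("day".equals(unit)) factor = 24L * 60L * 60L * 1000L;
        else factor = 60L * 60L * 1000L; // "hour"
        // documents older than this timestamp are considered stale
        return System.currentTimeMillis() - number * factor;
    }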


        }

        final int display = post.getInt("display", 1);

        // get segment
        Segment indexSegment = null;
        final boolean authorized = sb.verifyAuthentication(header, false);
        if (post != null && post.containsKey("segment") && authorized) {
            indexSegment = sb.indexSegments.segment(post.get("segment"));
        } else {
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        prop.put("display", display);
        prop.put("error_display", display);

        if (post.containsKey("words"))
            prop.putHTML("error_words", post.get("words"));
        else {
            prop.putHTML("error_words", "");
        }

        final String viewMode = post.get("viewMode","parsed");
        prop.put("error_vMode-" + viewMode, "1");

        DigestURI url = null;
        String descr = "";
        final int wordCount = 0;
        int size = 0;
        boolean pre = false;

        // get the url hash from which the content should be loaded
        String urlHash = post.get("urlHash", "");
        URIMetadataRow urlEntry = null;
        // get the urlEntry that belongs to the url hash
        if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
            // get the url that belongs to the entry
            final URIMetadataRow.Components metadata = urlEntry.metadata();
            if ((metadata == null) || (metadata.url() == null)) {
                prop.put("error", "3");
                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                // ... (snippet truncated here)
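
The snippet above resolves a url hash to its URIMetadataRow and then to the metadata components. Condensed into one hypothetical helper (the method name and null-handling are ours; the YaCy calls are exactly those used above):

    // illustrative only: resolve a url hash to its normalized URL, or null if unknown
    static String resolveUrlHash(final Segment segment, final String urlHash) {
        if (segment == null || urlHash == null || urlHash.length() == 0) return null;
        final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlHash));
        if (entry == null) return null; // hash not in the url metadata index
        final URIMetadataRow.Components metadata = entry.metadata();
        if (metadata == null || metadata.url() == null) return null; // damaged entry
        return metadata.url().toNormalform(true, false);
    }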

       
        final serverObjects prop = new serverObjects();
        if ((post == null) || (env == null)) return prop;
        final boolean authenticated = sb.adminAuthenticated(header) >= 2;
       
        Segment segment = null;
        if (post.containsKey("segment") && authenticated) {
            segment = sb.indexSegments.segment(post.get("segment"));
        } else {
            segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined
        final int     count  = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax
        final int     maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
        String  language = post.get("language", "");
        if (!ISO639.exists(language)) {
            // take language from the user agent
            String agent = header.get("User-Agent");
            if (agent == null) agent = System.getProperty("user.language");
            language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
            if (language == null) language = "en";
        }
        final TreeSet<String>[] query = QueryParams.cleanQuery(querystring); // also converts umlauts
        HandleSet q = Word.words2hashesHandles(query[0]);
       
        // tell all threads to do nothing for a specific time
        sb.intermissionAllThreads(3000);

        // prepare search
        final long timestamp = System.currentTimeMillis();
       
        // prepare an abstract result
        int indexabstractContainercount = 0;
        int joincount = 0;

        // retrieve index containers
        //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
       
        // get the index container with the result vector
        TermSearch<WordReference> search = null;
        try {
            search = segment.termIndex().query(q, Word.words2hashesHandles(query[1]), null, Segment.wordReferenceFactory, maxdist);
        } catch (RowSpaceExceededException e) {
            Log.logException(e);
        }
        final ReferenceContainer<WordReference> index = (search == null) ? null : search.joined(); // search stays null if the query above failed
       
        // ... (snippet truncated here)
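
Condensed, the term-search pipeline above reads as the sketch below. The roles of the query arrays are inferred from the two words2hashesHandles calls (query[0] holding the included words, query[1] the excluded words):

    // sketch of the pipeline, using the YaCy APIs exactly as shown above
    static ReferenceContainer<WordReference> joinedResults(final Segment segment,
            final String querystring, final int maxdist) throws RowSpaceExceededException {
        final TreeSet<String>[] query = QueryParams.cleanQuery(querystring);
        final HandleSet include = Word.words2hashesHandles(query[0]); // words that must occur
        final HandleSet exclude = Word.words2hashesHandles(query[1]); // words that must not occur
        final TermSearch<WordReference> search = segment.termIndex().query(
                include, exclude, null, Segment.wordReferenceFactory, maxdist);
        return search.joined(); // the intersection of all per-word reference containers
    }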

        CacheStrategy snippetFetchStrategy = (post == null) ? null : CacheStrategy.parse(post.get("verify", "cacheonly"));
        final servletProperties prop = new servletProperties();
        prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }

        final String EXT = header.get("EXT", "");
        final boolean rss = EXT.equals("rss");
        final boolean json = EXT.equals("json");
        prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting);
        prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
        prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
        if (post == null || indexSegment == null || env == null || !searchAllowed) {
            // we create empty entries for template strings
            prop.put("searchagain", "0");
            prop.put("former", "");
            prop.put("count", "10");
            prop.put("offset", "0");
            prop.put("resource", "global");
            prop.put("urlmaskfilter", (post == null) ? ".*" : post.get("urlmaskfilter", ".*"));
            prop.put("prefermaskfilter", (post == null) ? "" : post.get("prefermaskfilter", ""));
            prop.put("tenant", (post == null) ? "" : post.get("tenant", ""));
            prop.put("indexof", "off");
            prop.put("constraint", "");
            prop.put("cat", "href");
            prop.put("depth", "0");
            prop.put("search.verify", (post == null) ? sb.getConfig("search.verify", "iffresh") : post.get("verify", "iffresh"));
            prop.put("search.navigation", (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all"));
            prop.put("contentdom", "text");
            prop.put("contentdomCheckText", "1");
            prop.put("contentdomCheckAudio", "0");
            prop.put("contentdomCheckVideo", "0");
            prop.put("contentdomCheckImage", "0");
            prop.put("contentdomCheckApp", "0");
            prop.put("excluded", "0");
            prop.put("results", "");
            prop.put("resultTable", "0");
            prop.put("num-results", searchAllowed ? "0" : "4");
            prop.put("num-results_totalcount", 0);
            prop.put("num-results_offset", 0);
            prop.put("num-results_itemsPerPage", 10);
            prop.put("geoinfo", "0");
            prop.put("rss_queryenc", "");
            prop.put("meanCount", 5);
            return prop;
        }

        // check for JSONP
        if (post.containsKey("callback")) {
          final String jsonp = post.get("callback")+ "([";
          prop.put("jsonp-start", jsonp);
          prop.put("jsonp-end", "])");
        } else {
          prop.put("jsonp-start", "");
          prop.put("jsonp-end", "");
        }

        // Adding CORS Access header for yacysearch.rss output
        if (rss) {
            final ResponseHeader outgoingHeader = new ResponseHeader();
            outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*");
            prop.setOutgoingHeader(outgoingHeader);
        }

        // collect search attributes
        final boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query", "").equalsIgnoreCase(post.get("former", "")); // new search term

        // SRU syntax, with the old "count" property as an alternative; the cap depends on
        // authentication and on whether snippets may be fetched online
        int itemsPerPage = Math.min(
                (authenticated)
                        ? (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000)
                        : (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ? 20 : 500),
                post.getInt("maximumRecords", post.getInt("count", 10)));
        int offset = (newsearch) ? 0 : post.getInt("startRecord", post.getInt("offset", 0));

        final int newcount;
        if ( authenticated && (newcount = post.getInt("count", 0)) > 0 ) {
            sb.setConfig(SwitchboardConstants.SEARCH_ITEMS, newcount);
        } // set new default maximumRecords if search with "more options"

        boolean global = post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0;
        final boolean indexof = (post != null && post.get("indexof","").equals("on"));

        final String originalUrlMask;
        if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) { // option search all
            originalUrlMask = ".*";
        } else if (!newsearch && post.containsKey("urlmaskfilter")) {
            originalUrlMask = post.get("urlmaskfilter", ".*");
        } else {
            originalUrlMask = ".*";
        }

        String prefermask = (post == null) ? "" : post.get("prefermaskfilter", "");
        if (!prefermask.isEmpty() && prefermask.indexOf(".*") < 0) {
            prefermask = ".*" + prefermask + ".*";
        }

        Bitfield constraint = (post != null && post.containsKey("constraint") && !post.get("constraint", "").isEmpty()) ? new Bitfield(4, post.get("constraint", "______")) : null;
        if (indexof) {
            constraint = new Bitfield(4);
            constraint.set(Condenser.flag_cat_indexof, true);
        }

        // SEARCH
        final boolean indexReceiveGranted = sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) ||
                sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true);
        global = global && indexReceiveGranted; // if the user does not want indexes from remote peers, it cannot be a global search

        final boolean clustersearch = sb.isRobinsonMode() &&
                (sb.getConfig("cluster.mode", "").equals("privatecluster") ||
                 sb.getConfig("cluster.mode", "").equals("publiccluster"));
        if (clustersearch) {
            global = true;
        } // switches search on, but search target is limited to cluster nodes

        // increase search statistic counter
        if (!global) {
            // we count only searches on the local peer here, because global searches
            // are counted on the target peer to preserve privacy of the searcher
            if (authenticated) {
                // local or authenticated search requests are counted separately
                // because they are not part of a public available peer statistic
                sb.searchQueriesRobinsonFromLocal++;
            } else {
                // robinson-searches from non-authenticated requests are public
                // and may be part of the public available statistic
                sb.searchQueriesRobinsonFromRemote++;
            }
        }

        // find search domain
        final ContentDomain contentdom = ContentDomain.contentdomParser(post == null ? "text" : post.get("contentdom", "text"));

        // patch until better search profiles are available
        if ((contentdom != ContentDomain.TEXT) && (itemsPerPage <= 32)) {
            itemsPerPage = 64;
        }

        // check the search tracker
        TreeSet<Long> trackerHandles = sb.localSearchTracker.get(client);
        if (trackerHandles == null) {
            trackerHandles = new TreeSet<Long>();
        }
        boolean block = false;
        if (Domains.matchesList(client, sb.networkBlacklist)) {
            global = false;
            if (snippetFetchStrategy != null) {
                snippetFetchStrategy = null;
            }
            block = true;
            Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
        } else if (Domains.matchesList(client, sb.networkWhitelist)) {
            Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
        } else if (!authenticated && !localhostAccess) {
            // in case that we do a global search or we want to fetch snippets, we check for DoS cases
            synchronized (trackerHandles) {
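                // trackerHandles holds the timestamps of this client's previous accesses,
                // so tailSet(now - window) counts the accesses inside each sliding window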
                final int accInThreeSeconds = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size();
                final int accInOneMinute = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size();
                final int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size();
                // protections against too strong YaCy network load, reduces remote search
                if (global) {
                    if (accInTenMinutes >= 60 || accInOneMinute >= 6 || accInThreeSeconds >= 1) {
                        global = false;
                        Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed global search");
                    }
                }
                // protection against too many remote server snippet loads (protects traffic on server)
                if (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline()) {
                    if (accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1) {
                        snippetFetchStrategy = CacheStrategy.CACHEONLY;
                        Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed remote snippet loading");
                    }
                }
                // general load protection
                if (accInTenMinutes >= 3000 || accInOneMinute >= 600 || accInThreeSeconds >= 60) {
                    block = true;
                    Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + "/3s, " + accInOneMinute + "/60s, " + accInTenMinutes + "/600s, " + " requests, disallowed search");
                }
            }
        }

        if ((!block) && (post == null || post.get("cat", "href").equals("href"))) {
            String urlmask = null;

            // check available memory and clean up if necessary
            if (!MemoryControl.request(8000000L, false)) {
                indexSegment.urlMetadata().clearCache();
                SearchEventCache.cleanupEvents(true);
            }

            final RankingProfile ranking = sb.getRanking();

            if (querystring.indexOf("/near") >= 0) {
              querystring = querystring.replace("/near", "");
              ranking.coeff_worddistance = RankingProfile.COEFF_MAX;
            }
            if (querystring.indexOf("/date") >= 0) {
                querystring = querystring.replace("/date", "");
                ranking.coeff_date = RankingProfile.COEFF_MAX;
            }
            if (querystring.indexOf("/http") >= 0) {
                querystring = querystring.replace("/http", "");
                urlmask = "http://.*";
            }
            if (querystring.indexOf("/https") >= 0) {
                querystring = querystring.replace("/https", "");
                urlmask = "https://.*";
            }
            if (querystring.indexOf("/ftp") >= 0) {
                querystring = querystring.replace("/ftp", "");
                urlmask = "ftp://.*";
            }
            if (querystring.indexOf("/smb") >= 0) {
                querystring = querystring.replace("/smb", "");
                urlmask = "smb://.*";
            }
            if (querystring.indexOf("/file") >= 0) {
                querystring = querystring.replace("/file", "");
                urlmask = "file://.*";
            }
            if (querystring.indexOf("/location") >= 0) {
                querystring = querystring.replace("/location", "");
                if (constraint == null) {
                    constraint = new Bitfield(4);
                }
                constraint.set(Condenser.flag_cat_haslocation, true);
            }
            final int lrp = querystring.indexOf("/language/");
            String lr = "";
            if (lrp >= 0) {
                if (querystring.length() >= (lrp + 12)) {
                    // "/language/" is 10 characters long; the two-letter code follows it
                    lr = querystring.substring(lrp + 10, lrp + 12);
                }

                querystring = querystring.replace("/language/" + lr, "");
                lr = lr.toLowerCase();
            }
            final int inurl = querystring.indexOf("inurl:");
            if (inurl >= 0) {
                int ftb = querystring.indexOf(' ', inurl);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                final String urlstr = querystring.substring(inurl + 6, ftb);
                querystring = querystring.replace("inurl:" + urlstr, "");
                if (!urlstr.isEmpty()) {
                    urlmask = ".*" + urlstr + ".*";
                }
            }
            final int filetype = querystring.indexOf("filetype:");
            if (filetype >= 0) {
                int ftb = querystring.indexOf(' ', filetype);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                String ft = querystring.substring(filetype + 9, ftb);
                querystring = querystring.replace("filetype:" + ft, "");
                while (!ft.isEmpty() && ft.charAt(0) == '.') ft = ft.substring(1);
                if (!ft.isEmpty()) {
                    if (urlmask == null) {
                        urlmask = ".*\\." + ft;
                    } else {
                        urlmask = urlmask + ".*\\." + ft;
                    }
                }
            }
            String tenant = null;
            if (post.containsKey("tenant")) {
                tenant = post.get("tenant");
                if (tenant != null && tenant.isEmpty()) {
                    tenant = null;
                }
                if (tenant != null) {
                    if (urlmask == null) {
                        urlmask = ".*" + tenant + ".*";
                    } else urlmask = ".*" + tenant + urlmask;
                }
            }
            final int site = querystring.indexOf("site:");
            String sitehash = null;
            String sitehost = null;
            if (site >= 0) {
                int ftb = querystring.indexOf(' ', site);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                sitehost = querystring.substring(site + 5, ftb);
                querystring = querystring.replace("site:" + sitehost, "");
                while (sitehost.length() > 0 && sitehost.charAt(0) == '.') {
                    sitehost = sitehost.substring(1);
                }
                while (sitehost.endsWith(".")) {
                    sitehost = sitehost.substring(0, sitehost.length() - 1);
                }
                sitehash = DigestURI.hosthash(sitehost);
            }

            final int heuristicScroogle = querystring.indexOf("heuristic:scroogle");
            if (heuristicScroogle >= 0) {
                querystring = querystring.replace("heuristic:scroogle", "");
            }

            final int heuristicBlekko = querystring.indexOf("heuristic:blekko");
            if (heuristicBlekko >= 0) {
                querystring = querystring.replace("heuristic:blekko", "");
            }

            final int authori = querystring.indexOf("author:");
          String authorhash = null;
            if (authori >= 0) {
              // check if the author was given with single quotes or without
              final boolean quotes = (querystring.charAt(authori + 7) == (char) 39);
              String author;
              if (quotes) {
                    int ftb = querystring.indexOf((char) 39, authori + 8);
                    if (ftb == -1) {
                        ftb = querystring.length() + 1;
                    }
                    author = querystring.substring(authori + 8, ftb);
                    querystring = querystring.replace("author:'" + author + "'", "");
              } else {
                    int ftb = querystring.indexOf(' ', authori);
                    if (ftb == -1) {
                        ftb = querystring.length();
                    }
                    author = querystring.substring(authori + 7, ftb);
                    querystring = querystring.replace("author:" + author, "");
              }
              authorhash = ASCII.String(Word.word2hash(author));
            }
            final int tld = querystring.indexOf("tld:");
            if (tld >= 0) {
                int ftb = querystring.indexOf(' ', tld);
                if (ftb == -1) {
                    ftb = querystring.length();
                }
                String domain = querystring.substring(tld + 4, ftb);
                querystring = querystring.replace("tld:" + domain, "");
                while (domain.length() > 0 && domain.charAt(0) == '.') {
                    domain = domain.substring(1);
                }
                if (domain.indexOf('.') < 0) {
                    domain = "\\." + domain;
                } // is tld
                if (domain.length() > 0) {
                    urlmask = "[a-zA-Z]*://[^/]*" + domain + "/.*" + ((urlmask != null) ? urlmask : "");
                }
            }
            if (urlmask == null || urlmask.isEmpty()) {
                urlmask = originalUrlMask;
            } //if no urlmask was given

            // read the language from the language-restrict option 'lr'
            // if no one is given, use the user agent or the system language as default
            String language = (post == null) ? lr : post.get("lr", lr);
            if (language.startsWith("lang_")) {
                language = language.substring(5);
            }
            if (!ISO639.exists(language)) {
                // find out language of the user by reading of the user-agent string
                String agent = header.get(HeaderFramework.ACCEPT_LANGUAGE);
                if (agent == null) {
                    agent = System.getProperty("user.language");
                }
                language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
                if (language == null) {
                    language = "en";
                }
            }

            // navigation
            final String navigation = (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");

            // the query
            final TreeSet<String>[] query = QueryParams.cleanQuery(querystring.trim()); // also converts umlauts

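            // a quoted query is treated as a phrase search: limit the allowed
            // distance between the matched words instead of leaving it unbounded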
            final int maxDistance = (querystring.indexOf('"') >= 0) ? query.length - 1 : Integer.MAX_VALUE;

            // filter out stopwords
            final SortedSet<String> filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords);
            if (!filtered.isEmpty()) {
                SetTools.excludeDestructive(query[0], Switchboard.stopwords);
            }

            // if a minus-button was hit, remove a special reference first
            if (post != null && post.containsKey("deleteref")) {
                try {
                    if (!sb.verifyAuthentication(header, true)) {
                        prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                        return prop;
                    }

                    // delete the index entry locally
                    final String delHash = post.get("deleteref", ""); // urlhash
                    indexSegment.termIndex().remove(Word.words2hashesHandles(query[0]), delHash.getBytes());

                    // make new news message with negative voting
                    if (!sb.isRobinsonMode()) {
                        final Map<String, String> map = new HashMap<String, String>();
                        map.put("urlhash", delHash);
                        map.put("vote", "negative");
                        map.put("refid", "");
                        sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, map);
                    }
                } catch (final IOException e) {
                    Log.logException(e);
                }
            }

            // if a plus-button was hit, create new voting message
            if (post != null && post.containsKey("recommendref")) {
                if (!sb.verifyAuthentication(header, true)) {
                    prop.put("AUTHENTICATE", "admin log-in"); // force log-in
                    return prop;
                }
                final String recommendHash = post.get("recommendref", ""); // urlhash
                final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
                if (urlentry != null) {
                    final URIMetadataRow.Components metadata = urlentry.metadata();
                    Document[] documents = null;
                    try {
                        documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
                    } catch (final IOException e) {
                    } catch (final Parser.Failure e) {
                    }
                    if (documents != null) {
                        // create a news message
                        final Map<String, String> map = new HashMap<String, String>();
                        map.put("url", metadata.url().toNormalform(false, true).replace(',', '|'));
                        map.put("title", metadata.dc_title().replace(',', ' '));
                        map.put("description", documents[0].dc_title().replace(',', ' '));
                        map.put("author", documents[0].dc_creator());
                        map.put("tags", documents[0].dc_subject(' '));
                        sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_SURFTIPP_ADD, map);
                        documents[0].close();
                    }
                }
            }

            // prepare search properties
            final boolean globalsearch = (global) && indexReceiveGranted;

            // do the search
            final HandleSet queryHashes = Word.words2hashesHandles(query[0]);
            final Pattern snippetPattern = QueryParams.stringSearchPattern(originalquerystring);

            // check filters
            try {
                Pattern.compile(urlmask);
            } catch (final PatternSyntaxException ex) {
                Log.logWarning("SEARCH", "Illegal URL mask, not a valid regex: " + urlmask);
                prop.put("urlmaskerror", 1);
                prop.putHTML("urlmaskerror_urlmask", urlmask);
                urlmask = ".*";
            }

            try {
                Pattern.compile(prefermask);
            } catch (final PatternSyntaxException ex) {
                Log.logWarning("SEARCH", "Illegal prefer mask, not a valid regex: " + prefermask);
                prop.put("prefermaskerror", 1);
                prop.putHTML("prefermaskerror_prefermask", prefermask);
                prefermask = "";
            }

            final QueryParams theQuery = new QueryParams(
                    originalquerystring,
                    queryHashes,
                    Word.words2hashesHandles(query[1]),
                    Word.words2hashesHandles(query[2]),
                    snippetPattern,
                    tenant,
                    maxDistance,
                    prefermask,
                    contentdom,
                    language,
                    navigation,
                    snippetFetchStrategy,
                    itemsPerPage,
                    offset,
                    urlmask,
                    (clustersearch && globalsearch) ? QueryParams.SEARCHDOM_CLUSTERALL :
                    ((globalsearch) ? QueryParams.SEARCHDOM_GLOBALDHT : QueryParams.SEARCHDOM_LOCAL),
                    20,
                    constraint,
                    true,
                    sitehash,
                    authorhash,
                    DigestURI.TLD_any_zone_filter,
                    client,
                    authenticated,
                    indexSegment,
                    ranking,
                    header.get(RequestHeader.USER_AGENT, ""),
                    sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex());
            EventTracker.delete(EventTracker.EClass.SEARCH);
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.Type.INITIALIZATION, "", 0, 0), false);

            // tell all threads to do nothing for a specific time
            sb.intermissionAllThreads(3000);

            // filter out words that appear in bluelist
            theQuery.filterOut(Switchboard.blueList);

            // log
            Log.logInfo("LOCAL_SEARCH", "INIT WORD SEARCH: " + theQuery.queryString + ":" + QueryParams.hashSet2hashString(theQuery.queryHashes) + " - " + theQuery.neededResults() + " links to be computed, " + theQuery.displayResults() + " lines to be displayed");
            yacyChannel.channels(yacyChannel.LOCALSEARCH).addMessage(new RSSMessage("Local Search Request", theQuery.queryString, ""));
            final long timestamp = System.currentTimeMillis();

            // create a new search event
            if (SearchEventCache.getEvent(theQuery.id(false)) == null) {
                theQuery.setOffset(0); // in case that this is a new search, always start without a offset
                offset = 0;
            }
            final SearchEvent theSearch = SearchEventCache.getEvent(
                theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader,
                (int) sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_USER, sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_DEFAULT, 10)),
                      sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_USER, sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000)),
                (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0),
                (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));

            if (offset == 0) {
                if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) {
                    sb.heuristicSite(theSearch, sitehost);
                }
                if ((heuristicScroogle >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) {
                    sb.heuristicScroogle(theSearch);
                }
                if ((heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated) {
                    sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
                }
            }

            // log
            Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " +
                    "local-unfiltered(" + theSearch.getRankingResult().getLocalIndexCount() + "), " +
                    "-local_miss(" + theSearch.getRankingResult().getMissCount() + "), " +
                    "-local_sortout(" + theSearch.getRankingResult().getSortOutCount() + "), " +
                    "remote(" + theSearch.getRankingResult().getRemoteResourceSize() + ") links found, " +
                    (System.currentTimeMillis() - timestamp) + " ms");

            // prepare search statistics
            theQuery.resultcount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount() - theSearch.getRankingResult().getSortOutCount() + theSearch.getRankingResult().getRemoteIndexCount();
            theQuery.searchtime = System.currentTimeMillis() - timestamp;
            theQuery.urlretrievaltime = theSearch.result().getURLRetrievalTime();
            theQuery.snippetcomputationtime = theSearch.result().getSnippetComputationTime();
            AccessTracker.add(AccessTracker.Location.local, theQuery);

            // check suggestions
            final int meanMax = (post != null) ? post.getInt("meanCount", 0) : 0;

            prop.put("meanCount", meanMax);
            if (meanMax > 0 && !json && !rss) {
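                // compute "did you mean" spelling suggestions from the local term
                // index; suggestions are skipped for JSON and RSS output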
                final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
                final Iterator<String> meanIt = didYouMean.getSuggestions(100, 5).iterator();
                int meanCount = 0;
                String suggestion;
                while (meanCount < meanMax && meanIt.hasNext()) {
                    suggestion = meanIt.next();
View Full Code Here

        final String querystring = originalquerystring.replace('+', ' ');
        final int timeout = (post == null) ? 300 : post.getInt("timeout", 300);
        final int count = (post == null) ? 20 : post.getInt("count", 20);
       
        // get segment
        final Segment indexSegment;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            } else {
                // take default segment
                indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        int c = 0;
        if (more ||
                (indexSegment != null &&
                !indexSegment.termIndex().has(Word.word2hash(querystring))))
        {
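            // compute suggestions only if explicitly requested ("more") or if the
            // query term is unknown to the local term index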
            final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), querystring);
            final Iterator<String> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
            String suggestion;
            //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]
            while (c < count && meanIt.hasNext()) {
                suggestion = meanIt.next();
View Full Code Here

        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();
        File defaultSettingsFile = new File(sb.getAppPath(), "defaults/yacy.init");
       
        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if (post != null) {
            if (post.containsKey("defaultFile")) {
                // TODO check file-path!
                final File value = new File(sb.getAppPath(), post.get("defaultFile", "defaults/yacy.init"));
                // check if value is a readable file
                if (value.exists() && value.isFile() && value.canRead()) {
                    defaultSettingsFile = value;
                }
            }
            if (post.containsKey("Xmx")) {
                int xmx = post.getInt("Xmx", 500); // default maximum heap size
                if (OS.isWin32) xmx = Math.min(2000, xmx);
                int xms = xmx / 4;
              sb.setConfig("javastart_Xmx", "Xmx" + xmx + "m");
              sb.setConfig("javastart_Xms", "Xms" + xms + "m");
              prop.put("setStartupCommit", "1");
            }
            if(post.containsKey("diskFree")) {
              sb.setConfig(SwitchboardConstants.DISK_FREE, post.getInt("diskFree", 3000));
            }
            if(post.containsKey("diskFreeHardlimit")) {
              sb.setConfig(SwitchboardConstants.DISK_FREE_HARDLIMIT, post.getInt("diskFreeHardlimit", 1000));
            }
            if(post.containsKey("memoryAcceptDHT")) {
              sb.setConfig(SwitchboardConstants.MEMORY_ACCEPTDHT, post.getInt("memoryAcceptDHT", 50));
            }
            if(post.containsKey("resetObserver")) {
              MemoryControl.setDHTallowed();
            }
        }
        final Map<String, String> defaultSettings = ((post == null) || (!(post.containsKey("submitdefault")))) ? null : FileUtils.loadMap(defaultSettingsFile);
        Iterator<String> threads = sb.threadNames();
        String threadName;
        BusyThread thread;
       
        final boolean xml = (header.get(HeaderFramework.CONNECTION_PROP_PATH)).endsWith(".xml");
        prop.setLocalized(!xml);
       
        // calculate totals
        long blocktime_total = 0, sleeptime_total = 0, exectime_total = 0;
        while (threads.hasNext()) {
            threadName = threads.next();
            thread = sb.getThread(threadName);
            blocktime_total += thread.getBlockTime();
            sleeptime_total += thread.getSleepTime();
            exectime_total += thread.getExecTime();
        }  
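        // the totals serve as divisors for the percentage columns below, so they
        // must never be zero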
        if (blocktime_total == 0) blocktime_total = 1;
        if (sleeptime_total == 0) sleeptime_total = 1;
        if (exectime_total == 0) exectime_total = 1;
       
        // set templates for latest news from the threads
        long blocktime, sleeptime, exectime;
        long idlesleep, busysleep, memuse, memprereq;
        int queuesize;
        threads = sb.threadNames();
        int c = 0;
        long idleCycles, busyCycles, memshortageCycles;
        // set profile?
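        // profileSpeed is a percentage: values above 100 shorten the configured
        // sleep times, values below 100 stretch them (multiplier = 100 / speed)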
        final double multiplier = (post != null) && post.containsKey("profileSpeed") ? 100.0 / post.getFloat("profileSpeed", 100.0f) : 1.0;
        final boolean setProfile = (post != null && post.containsKey("submitdefault"));
        final boolean setDelay = (post != null) && (post.containsKey("submitdelay"));
        // save used settings file to config
        if (setProfile && post != null) {
            sb.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init"));
            sb.setConfig("performanceSpeed", post.getInt("profileSpeed", 100));
        }
       
        while (threads.hasNext()) {
            threadName = threads.next();
            thread = sb.getThread(threadName);
           
            // set values to templates
            prop.put("table_" + c + "_threadname", threadName);

      prop.putHTML("table_" + c + "_hasurl_shortdescr", thread.getShortDescription());
      if(thread.getMonitorURL() == null) {
        prop.put("table_"+c+"_hasurl", "0");
      }else{
        prop.put("table_"+c+"_hasurl", "1");
        prop.put("table_" + c + "_hasurl_url", thread.getMonitorURL());
      }
            prop.putHTML("table_" + c + "_longdescr", thread.getLongDescription());
            queuesize = thread.getJobCount();
            prop.put("table_" + c + "_queuesize", (queuesize == Integer.MAX_VALUE) ? "unlimited" : Formatter.number(queuesize, !xml));
           
            blocktime = thread.getBlockTime();
            sleeptime = thread.getSleepTime();
            exectime = thread.getExecTime();
            memuse = thread.getMemoryUse();
            idleCycles = thread.getIdleCycles();
            busyCycles = thread.getBusyCycles();
            memshortageCycles = thread.getOutOfMemoryCycles();
            prop.putNum("table_" + c + "_blocktime", blocktime / 1000);
            prop.putNum("table_" + c + "_blockpercent", 100 * blocktime / blocktime_total);
            prop.putNum("table_" + c + "_sleeptime", sleeptime / 1000);
            prop.putNum("table_" + c + "_sleeppercent", 100 * sleeptime / sleeptime_total);
            prop.putNum("table_" + c + "_exectime", exectime / 1000);
            prop.putNum("table_" + c + "_execpercent", 100 * exectime / exectime_total);
            prop.putNum("table_" + c + "_totalcycles", idleCycles + busyCycles + memshortageCycles);
            prop.putNum("table_" + c + "_idlecycles", idleCycles);
            prop.putNum("table_" + c + "_busycycles", busyCycles);
            prop.putNum("table_" + c + "_memscycles", memshortageCycles);
            prop.putNum("table_" + c + "_sleeppercycle", ((idleCycles + busyCycles) == 0) ? -1 : sleeptime / (idleCycles + busyCycles));
            prop.putNum("table_" + c + "_execpercycle", (busyCycles == 0) ? -1 : exectime / busyCycles);
            prop.putNum("table_" + c + "_memusepercycle", (busyCycles == 0) ? -1 : memuse / busyCycles / 1024);
           
            // load with old values
            idlesleep = sb.getConfigLong(threadName + "_idlesleep" , 1000);
            busysleep = sb.getConfigLong(threadName + "_busysleep",   100);
            memprereq = sb.getConfigLong(threadName + "_memprereq",     0);
            if (setDelay && post != null) {
                // load with new values
                idlesleep = post.getLong(threadName + "_idlesleep", idlesleep);
                busysleep = post.getLong(threadName + "_busysleep", busysleep);
                memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024;
                if (memprereq == 0) memprereq = sb.getConfigLong(threadName + "_memprereq", 0);
                   
                // check values to prevent short-cut loops
                if (idlesleep < 1000) idlesleep = 1000;
                if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
               
                sb.setThreadPerformance(threadName, idlesleep, busysleep, memprereq);
                idlesleep = sb.getConfigLong(threadName + "_idlesleep", idlesleep);
                busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep);
            }
            if (setProfile) {
                if (threadName.equals(SwitchboardConstants.PEER_PING) ||
                    threadName.equals(SwitchboardConstants.SEED_UPLOAD) ||
                    threadName.equals(SwitchboardConstants.CLEANUP)) {
                    /* do not change any values */
                } else {
                    // load with new values
                    idlesleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_idlesleep"), String.valueOf(idlesleep))) * multiplier);
                    busysleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_busysleep"), String.valueOf(busysleep))) * multiplier);
                    //memprereq = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_memprereq"), String.valueOf(memprereq))) * multiplier);

                    // check values to prevent short-cut loops
                    if (idlesleep < 1000) idlesleep = 1000;
                    if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; }
                    //if (threadName.equals(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && (busysleep < 50)) busysleep = 50;
                    sb.setThreadPerformance(threadName, idlesleep, busysleep, memprereq);
                }
            }
            prop.put("table_" + c + "_idlesleep", idlesleep);
            prop.put("table_" + c + "_busysleep", busysleep);
            prop.put("table_" + c + "_memprereq", memprereq / 1024);
            // disallow setting of memprereq for indexer to prevent db from throwing OOMs
            prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
            prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
            prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (indexSegment.termIndex().minMem() / 1024) : 0);
            c++;
        }
        prop.put("table", c);
       
        // performance profiles
        c = 0;
        final String usedfile = sb.getConfig("performanceProfile", "defaults/yacy.init");
        for (final String filename : performanceProfiles.keySet()) {
            prop.put("profile_" + c + "_filename", filename);
            prop.put("profile_" + c + "_description", performanceProfiles.get(filename));
            prop.put("profile_" + c + "_used", usedfile.equalsIgnoreCase(filename) ? "1" : "0");
            c++;
        }
        prop.put("profile", c);
       
        c = 0;
        final int[] speedValues = {200,150,100,50,25,10};
        final int usedspeed = sb.getConfigInt("performanceSpeed", 100);
        for (final int speed : speedValues) {
            prop.put("speed_" + c + "_value", speed);
            prop.put("speed_" + c + "_label", speed + " %");
            prop.put("speed_" + c + "_used", (speed == usedspeed) ? "1" : "0");
            c++;
        }
        prop.put("speed", c);
       
        if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
            final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
            sb.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
            indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount);
        }
       
        if ((post != null) && (post.containsKey("poolConfig"))) {
           
            /*
             * configuring the crawler pool
             */
            // get the current crawler pool configuration
            int maxBusy = post.getInt("Crawler Pool_maxActive", 8);
           
            // storing the new values into configfile
            sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, maxBusy);
            //switchboard.setConfig("crawler.MinIdleThreads",minIdle);
           
            /*
             * configuring the http pool
             */
            final WorkflowThread httpd = sb.getThread("10_httpd");
            try {
                maxBusy = post.getInt("httpd Session Pool_maxActive", 8);
            } catch (final NumberFormatException e) {
                maxBusy = 8;
            }

            ((serverCore) httpd).setMaxSessionCount(maxBusy);

            // storing the new values into configfile
            sb.setConfig("httpdMaxBusySessions", maxBusy);

        }       
       
        if ((post != null) && (post.containsKey("PrioritySubmit"))) {
          sb.setConfig("javastart_priority",post.get("YaCyPriority","0"));
        }
       
        if ((post != null) && (post.containsKey("onlineCautionSubmit"))) {
            sb.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000)));
            sb.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000)));
            sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
        }
       
        if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
            final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
            final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
            sb.setConfig("minimumLocalDelta", minimumLocalDelta);
            sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
            sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
        }
       
        // delta settings
        prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
        prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
       
        // table cache settings
        prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize())
        prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
        prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
        prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
        prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes
        prop.putNum("minAgeOfCache", indexSegment.termIndex().getBufferMinAge() / 1000 / 60); // minutes
        prop.putNum("maxWaitingWordFlush", sb.getConfigLong("maxWaitingWordFlush", 180));
        prop.put("wordCacheMaxCount", sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
        prop.put("crawlPauseProxy", sb.getConfigLong(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, 30000));
        prop.put("crawlPauseLocalsearch", sb.getConfigLong(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000));
        prop.put("crawlPauseRemotesearch", sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000));
View Full Code Here

        QueryParams theQuery = null;
        SearchEvent theSearch = null;
        ArrayList<WeakPriorityBlockingQueue.Element<ResultEntry>> accu = null;
        if (query.length() == 0 && abstractSet != null) {
            // this is _not_ a normal search, only a request for index abstracts
            final Segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
            theQuery = new QueryParams(
                    null,
                    abstractSet,
                    new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0),
                    null,
                    snippetPattern,
                    null,
                    maxdist,
                    prefer,
                    ContentDomain.contentdomParser(contentdom),
                    language,
                    "", // no navigation
                    CacheStrategy.CACHEONLY,
                    count,
                    0,
                    filter,
                    QueryParams.SEARCHDOM_LOCAL,
                    -1,
                    null,
                    false,
                    sitehash,
                    authorhash,
                    DigestURI.TLD_any_zone_filter,
                    client,
                    false,
                    indexSegment,
                    rankingProfile,
                    header.get(RequestHeader.USER_AGENT, ""),
                    false
                    );
            yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

            final long timer = System.currentTimeMillis();
            //final Map<byte[], ReferenceContainer<WordReference>>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls));
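            // conjunction search: intersect the reference containers of all query
            // term hashes, restricted to the URL handles supplied by the caller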
            final TreeMap<byte[], ReferenceContainer<WordReference>> incc = indexSegment.termIndex().searchConjunction(theQuery.queryHashes, QueryParams.hashes2Handles(urls));

            if (incc != null) {
                EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.Type.COLLECTION, "", incc.size(), System.currentTimeMillis() - timer), false);
                final Iterator<Map.Entry<byte[], ReferenceContainer<WordReference>>> ci = incc.entrySet().iterator();
                Map.Entry<byte[], ReferenceContainer<WordReference>> entry;
View Full Code Here

        final Switchboard sb = (Switchboard) env;
        // return variable that accumulates replacements
        final serverObjects prop = new serverObjects();

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if (post == null) {
            prop.put("linkfreq", sb.getConfigLong("defaultLinkReceiveFrequency",30));
            prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10));
            prop.put("dtable", "");
            prop.put("rtable", "");
            prop.putNum("wcount", indexSegment.termIndex().sizesMax());
            prop.putNum("ucount", indexSegment.urlMetadata().size());
            return prop; // be safe
        }
       
        if (post.containsKey("indexsharesetting")) {
            sb.setConfig(SwitchboardConstants.INDEX_DIST_ALLOW, post.containsKey("distribute"));
            sb.setConfig("allowReceiveIndex", post.containsKey("receive"));
            sb.setConfig("defaultLinkReceiveFrequency", post.getInt("linkfreq", 30));
            sb.setConfig("defaultWordReceiveFrequency", post.getInt("wordfreq", 10));
        }

        // insert constants
        prop.putNum("wcount", indexSegment.termIndex().sizesMax());
        prop.putNum("ucount", indexSegment.urlMetadata().size());
       
        // return rewrite properties
        return prop;
    }
View Full Code Here
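
The "get segment" preamble that opens most of these servlets is the same every time. A small helper like the one below would capture the idiom; this is a minimal sketch against the Segments API used in the snippets, and the name getSegmentOrDefault is invented here for illustration, not taken from the source above:

    private static Segment getSegmentOrDefault(final Switchboard sb, final serverObjects post) {
        // use the posted segment name if it refers to an existing segment
        if (post != null && post.containsKey("segment")) {
            final String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                return sb.indexSegments.segment(segmentName);
            }
        }
        // otherwise fall back to the default (PUBLIC) segment
        return sb.indexSegments.segment(Segments.Process.PUBLIC);
    }

Each servlet could then start with a single call, final Segment indexSegment = getSegmentOrDefault(sb, post);, instead of repeating the block.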

        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        prop.put("title", "DbCleanup_p");
       
        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
       
        if (post != null) {
            if (post.get("action").equals("ustart")) {
                if (urldbCleanerThread == null || !urldbCleanerThread.isAlive()) {
                    urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker);
                    urldbCleanerThread.start();
                } else {
                    urldbCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("ustop") && (urldbCleanerThread != null)) {
                urldbCleanerThread.abort();
            }
            else if (post.get("action").equals("upause") && (urldbCleanerThread != null)) {
                urldbCleanerThread.pause();
            }
            else if (post.get("action").equals("rstart")) {
                if (indexCleanerThread == null || !indexCleanerThread.isAlive()) {
                    indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash", "AAAAAAAAAAAA").getBytes());
                    indexCleanerThread.start();
                } else {
                    indexCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("rstop") && (indexCleanerThread != null)) {
                indexCleanerThread.abort();
            }
            else if (post.get("action").equals("rpause") && (indexCleanerThread != null)) {
                indexCleanerThread.pause();
            }
            prop.put("LOCATION","");
            return prop;
        }
        if (urldbCleanerThread!=null) {
            prop.put("urldb", "1");
            prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
            prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
            prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
            prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
            prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
            prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);
View Full Code Here

        for (final String s: sb.indexSegments.segmentNames()) {
            prop.put("segments_" + i + "_name", s);
            prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
            i++;
        }
        Segment segment = sb.indexSegments.segment(segmentName);
        prop.put("segments", i);
        prop.putNum("ucount", segment.urlMetadata().size());
        prop.put("otherHosts", "");
        prop.put("genUrlProfile", 0);
        prop.put("statistics", 1);
        prop.put("statistics_lines", 100);
        prop.put("statisticslines", 0);
        prop.put("reload", 0);

        // do segment selection
        if (post != null && post.containsKey("segment")) {
            // default values
            segmentName = post.get("segment", segmentName).trim();
            i = 0;
            for (final String s: sb.indexSegments.segmentNames()) {
                prop.put("segments_" + i + "_name", s);
                prop.put("segments_" + i + "_selected", (segmentName.equals(s)) ? 1 : 0);
                i++;
            }
            prop.put("segments", i);
            segment = sb.indexSegments.segment(segmentName);
        }

        // show export messages
        final MetadataRepository.Export export = segment.urlMetadata().export();
        if ((export != null) && (export.isAlive())) {
            // there is currently a running export
            prop.put("lurlexport", 2);
            prop.put("lurlexportfinished", 0);
            prop.put("lurlexporterror", 0);
            prop.put("lurlexport_exportfile", export.file().toString());
            prop.put("lurlexport_urlcount", export.count());
            prop.put("reload", 1);
        } else {
            prop.put("lurlexport", 1);
            prop.put("lurlexport_exportfile", sb.getDataPath() + "/DATA/EXPORT/" + GenericFormatter.SHORT_SECOND_FORMATTER.format());
            if (export == null) {
                // there has never been an export
                prop.put("lurlexportfinished", 0);
                prop.put("lurlexporterror", 0);
            } else {
                // an export was running but has finished
                prop.put("lurlexportfinished", 1);
                prop.put("lurlexportfinished_exportfile", export.file().toString());
                prop.put("lurlexportfinished_urlcount", export.count());
                if (export.failed() == null) {
                    prop.put("lurlexporterror", 0);
                } else {
                    prop.put("lurlexporterror", 1);
                    prop.put("lurlexporterror_exportfile", export.file().toString());
                    prop.put("lurlexporterror_exportfailmsg", export.failed());
                }
            }
        }

        if (post == null || env == null) {
            return prop; // nothing to do
        }

        // post values that are set on numerous input fields with same name
        String urlstring = post.get("urlstring", "").trim();
        String urlhash = post.get("urlhash", "").trim();

        if (!urlstring.startsWith("http://") &&
            !urlstring.startsWith("https://") &&
            !urlstring.startsWith("ftp://") &&
            !urlstring.startsWith("smb://") &&
            !urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; }

        prop.putHTML("urlstring", urlstring);
        prop.putHTML("urlhash", urlhash);
        prop.put("result", " ");

        if (post.containsKey("urlhashdeleteall")) {
            i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CacheStrategy.IFEXIST);
            prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashdelete")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
            } else {
                urlstring = entry.metadata().url().toNormalform(false, true);
                prop.put("urlstring", "");
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urldelete")) {
            try {
                urlhash = ASCII.String((new DigestURI(urlstring)).hash());
            } catch (final MalformedURLException e) {
                urlhash = null;
            }
            if ((urlhash == null) || (urlstring == null)) {
                prop.put("result", "No input given; nothing deleted.");
            } else {
                sb.urlRemove(segment, urlhash.getBytes());
                prop.putHTML("result", "Removed URL " + urlstring);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlstringsearch")) {
            try {
                final DigestURI url = new DigestURI(urlstring);
                urlhash = ASCII.String(url.hash());
                prop.put("urlhash", urlhash);
                final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
                if (entry == null) {
                    prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
                    prop.putHTML("urlstring", urlstring);
                    prop.put("urlhash", "");
                } else {
                    prop.putAll(genUrlProfile(segment, entry, urlhash));
                    prop.put("statistics", 0);
                }
            } catch (final MalformedURLException e) {
                prop.putHTML("result", "bad url: " + urlstring);
                prop.put("urlhash", "");
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("urlhashsearch")) {
            final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
            if (entry == null) {
                prop.putHTML("result", "No Entry for URL hash " + urlhash);
            } else {
                prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true));
                prop.putAll(genUrlProfile(segment, entry, urlhash));
                prop.put("statistics", 0);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        // generate list
        if (post.containsKey("urlhashsimilar")) {
            try {
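                // iterate URL hashes starting near the given hash, wrapping around
                // (RotateIterator), and lay out up to 256 of them in rows of 8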
                final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
                final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
                URIMetadataRow entry;
                i = 0;
                int rows = 0, cols = 0;
                prop.put("urlhashsimilar", "1");
                while (entryIt.hasNext() && i < 256) {
                    entry = entryIt.next();
                    if (entry == null) break;
                    prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
                    cols++;
                    if (cols==8) {
                        prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
                        cols = 0;
                        rows++;
                    }
                    i++;
                }
                prop.put("statistics", 0);
                prop.put("urlhashsimilar_rows", rows);
                prop.put("result", result.toString());
            } catch (final IOException e) {
                prop.putHTML("result", "No Entries for URL hash " + urlhash);
            }
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        if (post.containsKey("lurlexport")) {
            // parse format
            int format = 0;
            final String fname = post.get("format", "url-text");
            final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
            if (fname.endsWith("text")) format = 0;
            if (fname.endsWith("html")) format = 1;
            if (fname.endsWith("rss")) format = 2;

            // extend export file name
            String s = post.get("exportfile", "");
            if (s.indexOf('.') < 0) {
                if (format == 0) s = s + ".txt";
                if (format == 1) s = s + ".html";
                if (format == 2) s = s + ".xml";
            }
            final File f = new File(s);
            f.getParentFile().mkdirs();
            final String filter = post.get("exportfilter", ".*");
            final MetadataRepository.Export running = segment.urlMetadata().export(f, filter, null, format, dom);

            prop.put("lurlexport_exportfile", s);
            if (running != null) {
                prop.put("lurlexport_urlcount", running.count());
                if (running.failed() == null) {
                    prop.put("lurlexport", 2);
                }
            }
            prop.put("reload", 1);
        }

        if (post.containsKey("deletedomain")) {
            final String hp = post.get("hashpart");
            try {
                segment.urlMetadata().deleteDomain(hp);
            } catch (final IOException e) {
                // TODO Auto-generated catch block
                Log.logException(e);
            }
            // trigger the loading of the table
            post.put("statistics", "");
            prop.put("reload", 0);
        }

        if (post.containsKey("statistics")) {
            final int count = post.getInt("lines", 100);
            Iterator<MetadataRepository.HostStat> statsiter;
            prop.put("statistics_lines", count);
            int cnt = 0;
            try {
                final MetadataRepository metadata = segment.urlMetadata();
                statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
                boolean dark = true;
                MetadataRepository.HostStat hs;
                while (statsiter.hasNext() && cnt < count) {
                    hs = statsiter.next();
                    prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
                    prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
                    prop.put("statisticslines_domains_" + cnt + "lines", count);
                    prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
                    prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
                    dark = !dark;
                    cnt++;
                }
            } catch (final IOException e) {
                Log.logException(e);
            }
            prop.put("statisticslines_domains", cnt);
            prop.put("statisticslines", 1);
            prop.put("lurlexport", 0);
            prop.put("reload", 0);
        }

        // insert constants
        prop.putNum("ucount", segment.urlMetadata().size());
        // return rewrite properties
        return prop;
    }
View Full Code Here
