// Package de.anomic.search
//
// Source Code of de.anomic.search.RankingProcess

// RankingProcess.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2011-06-23 17:39:52 +0200 (Do, 23. Jun 2011) $
// $LastChangedRevision: 7795 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.search;

import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.EventTracker;
import de.anomic.yacy.graphics.ProfilingGraph;

public final class RankingProcess extends Thread {

    // capacity of the per-host queues in doubleDomCache; takeRWI() picks the
    // 'special' value when query.specialRights is set, the 'all' value otherwise
    private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000;

    // the query this ranking process works for
    private final QueryParams query;
    private final SortedSet<byte[]> urlhashes; // url hashes that add() has already pushed to the stack; used as double-check so that every url is stacked only once
    private final int[] flagcount; // flag counter: one counter per flag bit (32 bits), increased in add() for every polled entry
    private final SortedSet<byte[]> misses; // contains url-hashes that could not be found in the LURL-DB (metadata load returned null in takeURL())
    private       int sortout; // counter for references that had been sorted out for other reasons (rejected in takeURL())
    //private final int[] domZones;
    // inclusion containers of the local term search; null until run() has executed the local search
    private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;

    // statistics maintained by add(): size of the fed resources, number of
    // entries accepted to the stack, and number of non-local add() calls
    private int remote_resourceSize, remote_indexCount, remote_peerCount;
    private int local_resourceSize, local_indexCount;
    // ranked entries; filled by add(), drained by takeRWI()
    private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
    // number of feeders that are still expected to deliver; starts with 1 in the
    // constructor, changed by moreFeeders() and oneFeederTerminated()
    private int feeders;
    private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack; holds entries of hosts that already delivered one result in takeRWI()
    //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process

    private final ScoreMap<String> ref;  // reference score computation for the commonSense heuristic; words are counted in addTopic()
    private final Map<String, byte[]> hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
    private final ReferenceOrder order; // normalizes fed containers and computes the ranking of single entries in add()
    private final long startTime; // construction time in milliseconds; feedingIsFinished() never reports true within the first 50 ms after it
    private       boolean addRunning; // set to true when add() starts; set to false at the end of an add() that was called with finalizeAddAtEnd = true

    // navigation scores
    private final ScoreMap<String> hostNavigator; // a counter for the appearance of the host hash
    private final ScoreMap<String> authorNavigator; // a counter for the appearances of authors
    private final ScoreMap<String> namespaceNavigator; // a counter for name spaces
    private final ScoreMap<String> protocolNavigator; // a counter for protocol types
    private final ScoreMap<String> filetypeNavigator; // a counter for file types


    public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        // sortorder: 0 = hash, 1 = url, 2 = ranking
        this.addRunning = true;
        this.localSearchInclusion = null;
        this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(maxentries);
        this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
        this.query = query;
        this.order = order;
        this.remote_peerCount = 0;
        this.remote_resourceSize = 0;
        this.remote_indexCount = 0;
        this.local_resourceSize = 0;
        this.local_indexCount = 0;
        this.urlhashes = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
        //this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.misses = new TreeSet<byte[]>(URIMetadataRow.rowdef.objectOrder);
        //this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.sortout = 0;
        this.flagcount = new int[32];
        for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
        this.hostNavigator = new ConcurrentScoreMap<String>();
        this.hostResolver = new ConcurrentHashMap<String, byte[]>();
        this.authorNavigator = new ConcurrentScoreMap<String>();
        this.namespaceNavigator = new ConcurrentScoreMap<String>();
        this.protocolNavigator = new ConcurrentScoreMap<String>();
        this.filetypeNavigator = new ConcurrentScoreMap<String>();
        this.ref = new ConcurrentScoreMap<String>();
        this.feeders = 1;
        this.startTime = System.currentTimeMillis();
    }

    public QueryParams getQuery() {
        return this.query;
    }

    public ReferenceOrder getOrder() {
        return this.order;
    }

    @Override
    public void run() {
        // do a search

        // sort the local containers and truncate it to a limited count,
        // so following sortings together with the global results will be fast
        try {
            final long timer = System.currentTimeMillis();
            final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
                    this.query.queryHashes,
                    this.query.excludeHashes,
                    null,
                    Segment.wordReferenceFactory,
                    this.query.maxDistance);
            this.localSearchInclusion = search.inclusion();
            final ReferenceContainer<WordReference> index = search.joined();
            EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false);
            if (index.isEmpty()) {
                return;
            }

            add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true);
        } catch (final Exception e) {
            Log.logException(e);
        } finally {
            oneFeederTerminated();
        }
    }

    public void add(
            final ReferenceContainer<WordReference> index,
            final boolean local,
            final String resourceName,
            final int fullResource,
            final boolean finalizeAddAtEnd) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime

        this.addRunning = true;

        assert (index != null);
        if (index.isEmpty()) return;

        if (local) {
            this.local_resourceSize += index.size();
        } else {
            assert fullResource >= 0 : "fullResource = " + fullResource;
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
        }

        long timer = System.currentTimeMillis();

        // normalize entries
        final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);

        // iterate over normalized entries and select some that are better than currently stored
        timer = System.currentTimeMillis();
        final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;

        // apply all constraints
        try {
            WordReferenceVars iEntry;
            final String pattern = this.query.urlMask.pattern();
            final boolean httpPattern = pattern.equals("http://.*");
            final boolean noHttpButProtocolPattern = pattern.equals("https://.*") || pattern.equals("ftp://.*") || pattern.equals("smb://.*") || pattern.equals("file://.*");
            pollloop: while (true) {
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if (iEntry == null || iEntry == WordReferenceVars.poison) break pollloop;
                assert (iEntry.urlhash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                // increase flag counts
                for (int j = 0; j < 32; j++) {
                    if (iEntry.flags().get(j)) {this.flagcount[j]++;}
                }

                // check constraints
                if (!testFlags(iEntry)) {
                    continue pollloop;
                }

                // check document domain
                if (this.query.contentdom != ContentDomain.TEXT) {
                    if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue pollloop; }
                    if ((this.query.contentdom == ContentDomain.APP  ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp  )))) { continue pollloop; }
                }

                // check tld domain
                /*
                if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) {
                    // filter out all tld that do not match with wanted tld domain
                    this.sortout++;
                    continue;
                }
                */

                // count domZones
                //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;

                // check site constraints
                final String hosthash = iEntry.hosthash();
                if (this.query.sitehash == null) {
                    // no site constraint there; maybe collect host navigation information
                    if (nav_hosts && this.query.urlMask_isCatchall) {
                        this.hostNavigator.inc(hosthash);
                        this.hostResolver.put(hosthash, iEntry.urlhash());
                    }
                } else {
                    if (!hosthash.equals(this.query.sitehash)) {
                        // filter out all domains that do not match with the site constraint
                        continue pollloop;
                    }
                }

                // check protocol
                if (!this.query.urlMask_isCatchall) {
                    final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
                    if (httpPattern && !httpFlagSet) continue pollloop;
                    if (noHttpButProtocolPattern && httpFlagSet) continue pollloop;
                }

                // finally make a double-check and insert result to stack
                if (this.urlhashes.add(iEntry.urlhash())) {
                    rankingtryloop: while (true) {
                        try {
                            this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
                            break rankingtryloop;
                        } catch (final ArithmeticException e) {
                            // this may happen if the concurrent normalizer changes values during cardinal computation
                            continue rankingtryloop;
                        }
                    }
                    // increase counter for statistics
                    if (local) this.local_indexCount++; else this.remote_indexCount++;
                }
            }

        } catch (final InterruptedException e) {} finally {
            if (finalizeAddAtEnd) this.addRunning = false;
        }

        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
    }

    /**
     * method to signal the incoming stack that one feeder has terminated
     */
    public void oneFeederTerminated() {
      this.feeders--;
      assert this.feeders >= 0 : "feeders = " + this.feeders;
    }

    protected void moreFeeders(final int countMoreFeeders) {
      this.feeders += countMoreFeeders;
    }

    public boolean feedingIsFinished() {
      return System.currentTimeMillis() - this.startTime > 50 && this.feeders == 0;
    }

    private boolean testFlags(final WordReference ientry) {
        if (this.query.constraint == null) return true;
        // test if ientry matches with filter
        // if all = true: let only entries pass that has all matching bits
        // if all = false: let all entries pass that has at least one matching bit
        if (this.query.allofconstraint) {
            for (int i = 0; i < 32; i++) {
                if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
            }
            return true;
        }
        for (int i = 0; i < 32; i++) {
            if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true;
        }
        return false;
    }

    protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
        // direct access to the result maps is needed for abstract generation
        // this is only available if execQuery() was called before
        return this.localSearchInclusion;
    }

    private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(final boolean skipDoubleDom, final long waitingtime) {

        // returns from the current RWI list the best entry and removes this entry from the list
        WeakPriorityBlockingQueue<WordReferenceVars> m;
        WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;

        // take one entry from the stack if there are entries on that stack or the feeding is not yet finished
        try {
            //System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
            int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
            final long timeout = System.currentTimeMillis() + waitingtime;
            while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
                   (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) {
                if (waitingtime <= 0) {
                    rwi = this.stack.poll();
                } else timeoutloop:while (System.currentTimeMillis() < timeout) {
                    if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
                    rwi = this.stack.poll(50);
                    if (rwi != null) break timeoutloop;
                }
                if (rwi == null) break;
                if (!skipDoubleDom) {
                    //System.out.println("!skipDoubleDom");
                    return rwi;
                 }

                // check doubledom
                final String hosthash = rwi.getElement().hosthash();
                synchronized (this.doubleDomCache) {
                    m = this.doubleDomCache.get(hosthash);
                    if (m == null) {
                        // first appearance of dom. we create an entry to signal that one of that domain was already returned
                        m = new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
                        this.doubleDomCache.put(hosthash, m);
                        return rwi;
                    }
                    // second appearances of dom
                    m.put(rwi);
                }
            }
        } catch (final InterruptedException e1) {}
        if (this.doubleDomCache.isEmpty()) return null;

        // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
        // find best entry from all caches
        WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
        WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
        synchronized (this.doubleDomCache) {
            final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
            while (i.hasNext()) {
                try {
                    m = i.next();
                } catch (final ConcurrentModificationException e) {
                    Log.logException(e);
                    continue; // not the best solution...
                }
                if (m == null) continue;
                if (m.isEmpty()) continue;
                if (bestEntry == null) {
                    bestEntry = m.peek();
                    continue;
                }
                o = m.peek();
                if (o == null) continue;
                if (o.getWeight() < bestEntry.getWeight()) {
                    bestEntry = o;
                }
            }
            if (bestEntry == null) return null;

            // finally remove the best entry from the doubledom cache
            m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
            bestEntry = m.poll();
        }
        return bestEntry;
    }

    /**
     * get one metadata entry from the ranked results. This will be the 'best' entry so far
     * according to the applied ranking. If there are no more entries left or the timeout
     * limit is reached then null is returned. The caller may distinguish the timeout case
     * from the case where there will be no more also in the future by calling this.feedingIsFinished()
     * @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
     * @param waitingtime the time this method may take for a result computation
     * @return a metadata entry for a url
     */
    public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) {
        // returns from the current RWI list the best URL entry and removes this entry from the list
      final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
      int p = -1;
      long timeleft;
      while ((timeleft = timeout - System.currentTimeMillis()) > 0) {
          //System.out.println("timeleft = " + timeleft);
            final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
            if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
            final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
            if (page == null) {
              this.misses.add(obrwi.getElement().urlhash());
              continue;
            }

            // prepare values for constraint check
            final URIMetadataRow.Components metadata = page.metadata();

            // check errors
            if (metadata == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            if (!this.query.urlMask_isCatchall) {
                // check url mask
                if (!metadata.matches(this.query.urlMask)) {
                    this.sortout++;
                    continue;
                }

                // in case that we do not have e catchall filter for urls
                // we must also construct the domain navigator here
                //if (query.sitehash == null) {
                //    this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
                //    this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
                //}
            }

            // check for more errors
            if (metadata.url() == null) {
                this.sortout++;
                continue; // rare case where the url is corrupted
            }

            final String pageurl = metadata.url().toNormalform(true, true);
            final String pageauthor = metadata.dc_creator();
            final String pagetitle = metadata.dc_title().toLowerCase();

            // check exclusion
            if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) ||
                (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) ||
                (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) {
                this.sortout++;
                continue;
            }

            // check index-of constraint
            if ((this.query.constraint != null) &&
                (this.query.constraint.get(Condenser.flag_cat_indexof)) &&
                (!(pagetitle.startsWith("index of")))) {
                final Iterator<byte[]> wi = this.query.queryHashes.iterator();
                while (wi.hasNext()) {
                    this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
                }
                this.sortout++;
                continue;
            }

            // check location constraint
            if ((this.query.constraint != null) &&
                (this.query.constraint.get(Condenser.flag_cat_haslocation)) &&
                (metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
                this.sortout++;
                continue;
            }

            // check content domain
            if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
                (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
                (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
                (this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
                this.sortout++;
              continue;
            }

            // evaluate information of metadata for navigation
            // author navigation:
            if (pageauthor != null && pageauthor.length() > 0) {
              // add author to the author navigator
                final String authorhash = ASCII.String(Word.word2hash(pageauthor));

                // check if we already are filtering for authors
              if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
                    this.sortout++;
                continue;
              }

              // add author to the author navigator
                this.authorNavigator.inc(pageauthor);
            } else if (this.query.authorhash != null) {
                this.sortout++;
              continue;
            }

            // namespace navigation
            String pagepath = metadata.url().getPath();
            if ((p = pagepath.indexOf(':')) >= 0) {
                pagepath = pagepath.substring(0,p);
                p = pagepath.lastIndexOf('/');
                if (p >= 0) {
                    pagepath = pagepath.substring(p + 1);
                    this.namespaceNavigator.inc(pagepath);
                }
            }

            // protocol navigation
            final String protocol = metadata.url().getProtocol();
            this.protocolNavigator.inc(protocol);

            // file type navigation
            final String fileext = metadata.url().getFileExtension();
            if (fileext.length() > 0) this.filetypeNavigator.inc(fileext);

            // check Scanner
            if (!Scanner.acceptURL(metadata.url())) {
                this.sortout++;
                continue;
            }

            // accept url
            return page;
        }
        return null;
    }

    public int sizeQueue() {
        int c = this.stack.sizeQueue();
        for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            c += s.sizeQueue();
        }
        return c;
    }

    public int sizeAvailable() {
        int c = this.stack.sizeAvailable();
        for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            c += s.sizeAvailable();
        }
        return c;
    }

    public boolean isEmpty() {
        if (!this.stack.isEmpty()) return false;
        for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
            if (!s.isEmpty()) return false;
        }
        return true;
    }

    public int[] flagCount() {
      return this.flagcount;
    }

    // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."

    public int filteredCount() {
        // the number of index entries that are considered as result set
        return this.stack.sizeAvailable();
    }

    public int getLocalIndexCount() {
        // the number of results in the local peer after filtering
        return this.local_indexCount;
    }

    public int getRemoteIndexCount() {
        // the number of result contributions from all the remote peers
        return this.remote_indexCount;
    }

    public int getRemoteResourceSize() {
        // the number of all hits in all the remote peers
        return Math.max(this.remote_resourceSize, this.remote_indexCount);
    }

    public int getRemotePeerCount() {
        // the number of remote peers that have contributed
        return this.remote_peerCount;
    }

    public Iterator<byte[]> miss() {
        return this.misses.iterator();
    }

    public int getMissCount() {
        return this.misses.size();
    }

    public int getSortOutCount() {
        return this.sortout;
    }

    public ScoreMap<String> getNamespaceNavigator() {
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace") < 0) return new ClusteredScoreMap<String>();
        if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful
        return this.namespaceNavigator;
    }

    public ScoreMap<String> getHostNavigator() {
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;

        final Iterator<String> domhashs = this.hostNavigator.keys(false);
        URIMetadataRow row;
        byte[] urlhash;
        String hosthash, hostname;
        if (this.hostResolver != null) while (domhashs.hasNext() && result.sizeSmaller(30)) {
            hosthash = domhashs.next();
            if (hosthash == null) continue;
            urlhash = this.hostResolver.get(hosthash);
            row = urlhash == null ? null : this.query.getSegment().urlMetadata().load(urlhash);
            hostname = row == null ? null : row.metadata().url().getHost();
            if (hostname != null) {
                result.set(hostname, this.hostNavigator.get(hosthash));
            }
        }
        if (result.sizeSmaller(2)) result.clear(); // navigators with one entry are not useful
        return result;
    }

    public ScoreMap<String> getProtocolNavigator() {
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol") < 0) return new ClusteredScoreMap<String>();
        if (this.protocolNavigator.sizeSmaller(2)) this.protocolNavigator.clear(); // navigators with one entry are not useful
        return this.protocolNavigator;
    }

    public ScoreMap<String> getFiletypeNavigator() {
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype") < 0) return new ClusteredScoreMap<String>();
        if (this.filetypeNavigator.sizeSmaller(2)) this.filetypeNavigator.clear(); // navigators with one entry are not useful
        return this.filetypeNavigator;
    }

    public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
        public int compare(final Map.Entry<String, Integer> o1, final Map.Entry<String, Integer> o2) {
            if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
            if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
            return 0;
        }
    };

    /**
     * Returns the topic navigator. If the query asks neither for "all"
     * navigators nor for "topics", a new empty map is returned. Otherwise the
     * internal reference score map (the word counts collected by addTopic)
     * is returned; it is cleared before if sizeSmaller(2) holds for it.
     * <p>
     * NOTE(review): the method also computes a normalized score map in
     * 'result' (word count divided by the term index count of the word,
     * scaled to 0..count), but that map is not returned. Either the
     * computation or the return statement looks unintended; callers currently
     * get the raw scores, so confirm the intention before changing it.
     *
     * @param count the maximum number of words that are evaluated for the normalized scores and the scale of these scores
     * @return an empty map if topics are not wanted, otherwise the internal reference score map
     */
    public ScoreMap<String> getTopicNavigator(final int count) {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        final ScoreMap<String> result = new ConcurrentScoreMap<String>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return result;
        if (this.ref.sizeSmaller(2)) this.ref.clear(); // navigators with one entry are not useful
        final Map<String, Float> counts = new HashMap<String, Float>();
        final Iterator<String> i = this.ref.keys(false);
        String word;
        byte[] termHash;
        int c;
        // NOTE(review): Float.MIN_VALUE is the smallest positive float, not the most
        // negative one; this only works as start value for 'max' as long as q is positive
        float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
        int ic = count;
        // evaluate at most 'count' words
        while (ic-- > 0 && i.hasNext()) {
            word = i.next();
            if (word == null) continue;
            termHash = Word.word2hash(word);
            c = this.query.getSegment().termIndex().count(termHash);
            if (c > 0) {
                // relation of the score of the word in the results to the count of the word in the term index
                q = ((float) this.ref.get(word)) / ((float) c);
                min = Math.min(min, q);
                max = Math.max(max, q);
                counts.put(word, q);
            }
        }
        // scale the relations linearly to the range 0..count
        if (max > min) for (final Map.Entry<String, Float> ce: counts.entrySet()) {
            result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
        }
        // NOTE(review): 'result' is not used from here on, the raw scores are returned
        return this.ref;
    }

    private final static Pattern lettermatch = Pattern.compile("[a-z]+");

    public void addTopic(final String[] words) {
        String word;
        for (final String w : words) {
            word = w.toLowerCase();
            if (word.length() > 2 &&
                "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 &&
                !this.query.queryHashes.has(Word.word2hash(word)) &&
                lettermatch.matcher(word).matches() &&
                !Switchboard.badwords.contains(word) &&
                !Switchboard.stopwords.contains(word)) {
                this.ref.inc(word);
            }
        }
    }

    protected void addTopics(final ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
        //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description

        // add references
        //addTopic(urlcomps);
        addTopic(descrcomps);
    }

    public ScoreMap<String> getAuthorNavigator() {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ConcurrentScoreMap<String>();
        if (this.authorNavigator.sizeSmaller(2)) this.authorNavigator.clear(); // navigators with one entry are not useful
        return this.authorNavigator;
    }

}
// TOP
//
// Related Classes of de.anomic.search.RankingProcess
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.