Package de.anomic.search

Source Code of de.anomic.search.ReferenceOrder$NormalizeWorker

// ReferenceOrder.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2011-06-01 21:31:56 +0200 (Mi, 01. Jun 2011) $
// $LastChangedRevision: 7766 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.search;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteBuffer;


public class ReferenceOrder {

    private static int cores = Runtime.getRuntime().availableProcessors();

    private       int maxdomcount;
    private       WordReferenceVars min, max;
    private final ConcurrentScoreMap<String> doms; // collected for "authority" heuristic
    private final RankingProfile ranking;
    private final byte[] language;

    public ReferenceOrder(final RankingProfile profile, final byte[] language) {
        this.min = null;
        this.max = null;
        this.ranking = profile;
        this.doms = new ConcurrentScoreMap<String>();
        this.maxdomcount = 0;
        this.language = language;
    }

    public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
        final LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
        int threads = cores;
        if (container.size() < 100) threads = 2;
        final Thread distributor = new NormalizeDistributor(container, out, threads);
        distributor.start();
        try {
            distributor.join(10); // let the distributor work for at least 10 milliseconds
        } catch (final InterruptedException e) {
        }

        // return the resulting queue while the processing queues are still working
        return out;
    }

    private final class NormalizeDistributor extends Thread {

        ReferenceContainer<WordReference> container;
        LinkedBlockingQueue<WordReferenceVars> out;
        private final int threads;

        public NormalizeDistributor(final ReferenceContainer<WordReference> container, final LinkedBlockingQueue<WordReferenceVars> out, final int threads) {
            this.container = container;
            this.out = out;
            this.threads = threads;
        }

        @Override
        public void run() {
            // transform the reference container into a stream of parsed entries
            final BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(this.container);

            // start the transformation threads
            final Semaphore termination = new Semaphore(this.threads);
            final NormalizeWorker[] worker = new NormalizeWorker[this.threads];
            for (int i = 0; i < this.threads; i++) {
                worker[i] = new NormalizeWorker(this.out, termination);
                worker[i].start();
            }

            // fill the queue
            WordReferenceVars iEntry;
            int p = 0;
            try {
                while ((iEntry = vars.take()) != WordReferenceVars.poison) {
                    worker[p % this.threads].add(iEntry);
                    p++;
                }
            } catch (final InterruptedException e) {
            }

            // insert poison to stop the queues
            for (int i = 0; i < this.threads; i++) worker[i].add(WordReferenceVars.poison);

            // wait for termination but not too long to make it possible that this
            // is called from outside with a join to get some normalization results
            // before going on
            for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (final InterruptedException e) {}
        }
    }

    /**
     * normalize ranking: find minimum and maximum of separate ranking criteria
     */
    private class NormalizeWorker extends Thread {

        private final BlockingQueue<WordReferenceVars> out;
        private final Semaphore termination;
        private final BlockingQueue<WordReferenceVars> decodedEntries;

        public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, final Semaphore termination) {
            this.out = out;
            this.termination = termination;
            this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
        }

        public void add(final WordReferenceVars entry) {
            try {
                this.decodedEntries.put(entry);
            } catch (final InterruptedException e) {
            }
        }

        public void run() {
            try {
                WordReferenceVars iEntry;
                final Map<String, Integer> doms0 = new HashMap<String, Integer>();
                String dom;
                Integer count;
                final Integer int1 = 1;
                while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
                    // find min/max
                    if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
                    if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
                    this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
                    // update domcount
                    dom = iEntry.hosthash();
                    count = doms0.get(dom);
                    if (count == null) {
                        doms0.put(dom, int1);
                    } else {
                        doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
                    }
                }

                // update domain score
                Map.Entry<String, Integer> entry;
                final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
                while (di.hasNext()) {
                    entry = di.next();
                    ReferenceOrder.this.doms.inc(entry.getKey(), (entry.getValue()).intValue());
                }
                if (!ReferenceOrder.this.doms.isEmpty()) ReferenceOrder.this.maxdomcount = ReferenceOrder.this.doms.getMaxScore();
            } catch (final InterruptedException e) {
                Log.logException(e);
            } catch (final Exception e) {
                Log.logException(e);
            } finally {
                // insert poison to signal the termination to next queue
                try {
                    this.termination.acquire();
                    if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
                } catch (final InterruptedException e) {}
            }
        }
    }

    public int authority(final String hostHash) {
        assert hostHash.length() == 6;
        return (this.doms.get(hostHash) << 8) / (1 + this.maxdomcount);
    }

    /**
     * return the ranking of a given word entry
     * @param t
     * @return a ranking: the higher the number, the better is the ranking
     */
    public long cardinal(final WordReferenceVars t) {
        //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
        // the normalizedEntry must be a normalized indexEntry
        final Bitfield flags = t.flags();
        assert this.min != null;
        assert this.max != null;
        assert t != null;
        assert this.ranking != null;
        final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
        //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
        final int maxmaxpos = this.max.maxposition();
        final int minminpos = this.min.minposition();
        final long r =
             ((256 - DigestURI.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
           + ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.urlhash()) << 4)) << this.ranking.coeff_ybr) : 0)
           + ((this.max.urlcomps()      == this.min.urlcomps()   )   ? 0 : (256 - (((t.urlcomps()     - this.min.urlcomps()     ) << 8) / (this.max.urlcomps()     - this.min.urlcomps())     )) << this.ranking.coeff_urlcomps)
           + ((this.max.urllength()     == this.min.urllength()  )   ? 0 : (256 - (((t.urllength()    - this.min.urllength()    ) << 8) / (this.max.urllength()    - this.min.urllength())    )) << this.ranking.coeff_urllength)
                                        + ((maxmaxpos == minminpos) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
           + ((this.max.posofphrase()   == this.min.posofphrase())   ? 0 : (256 - (((t.posofphrase()  - this.min.posofphrase()  ) << 8) / (this.max.posofphrase()  - this.min.posofphrase())  )) << this.ranking.coeff_posofphrase)
           + ((this.max.posinphrase()   == this.min.posinphrase())   ? 0 : (256 - (((t.posinphrase()  - this.min.posinphrase()  ) << 8) / (this.max.posinphrase()  - this.min.posinphrase())  )) << this.ranking.coeff_posinphrase)
           + ((this.max.distance()      == this.min.distance()   )   ? 0 : (256 - (((t.distance()     - this.min.distance()     ) << 8) / (this.max.distance()     - this.min.distance())     )) << this.ranking.coeff_worddistance)
           + ((this.max.virtualAge()    == this.min.virtualAge())    ? 0 :        (((t.virtualAge()   - this.min.virtualAge()   ) << 8) / (this.max.virtualAge()   - this.min.virtualAge())    ) << this.ranking.coeff_date)
           + ((this.max.wordsintitle()  == this.min.wordsintitle())  ? 0 : (((t.wordsintitle() - this.min.wordsintitle()  ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle())  ) << this.ranking.coeff_wordsintitle)
           + ((this.max.wordsintext()   == this.min.wordsintext())   ? 0 : (((t.wordsintext()  - this.min.wordsintext()   ) << 8) / (this.max.wordsintext()  - this.min.wordsintext())   ) << this.ranking.coeff_wordsintext)
           + ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext)
           + ((this.max.llocal()        == this.min.llocal())        ? 0 : (((t.llocal()       - this.min.llocal()        ) << 8) / (this.max.llocal()       - this.min.llocal())        ) << this.ranking.coeff_llocal)
           + ((this.max.lother()        == this.min.lother())        ? 0 : (((t.lother()       - this.min.lother()        ) << 8) / (this.max.lother()       - this.min.lother())        ) << this.ranking.coeff_lother)
           + ((this.max.hitcount()      == this.min.hitcount())      ? 0 : (((t.hitcount()     - this.min.hitcount()      ) << 8) / (this.max.hitcount()     - this.min.hitcount())      ) << this.ranking.coeff_hitcount)
           + tf
           + ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
           + ((flags.get(WordReferenceRow.flag_app_dc_identifier))  ? 255 << this.ranking.coeff_appurl             : 0)
           + ((flags.get(WordReferenceRow.flag_app_dc_title))       ? 255 << this.ranking.coeff_app_dc_title       : 0)
           + ((flags.get(WordReferenceRow.flag_app_dc_creator))     ? 255 << this.ranking.coeff_app_dc_creator     : 0)
           + ((flags.get(WordReferenceRow.flag_app_dc_subject))     ? 255 << this.ranking.coeff_app_dc_subject     : 0)
           + ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
           + ((flags.get(WordReferenceRow.flag_app_emphasized))     ? 255 << this.ranking.coeff_appemph            : 0)
           + ((flags.get(Condenser.flag_cat_indexof))      ? 255 << this.ranking.coeff_catindexof         : 0)
           + ((flags.get(Condenser.flag_cat_hasimage))     ? 255 << this.ranking.coeff_cathasimage        : 0)
           + ((flags.get(Condenser.flag_cat_hasaudio))     ? 255 << this.ranking.coeff_cathasaudio        : 0)
           + ((flags.get(Condenser.flag_cat_hasvideo))     ? 255 << this.ranking.coeff_cathasvideo        : 0)
           + ((flags.get(Condenser.flag_cat_hasapp))       ? 255 << this.ranking.coeff_cathasapp          : 0)
           + ((ByteBuffer.equals(t.language, this.language)) ? 255 << this.ranking.coeff_language           : 0)
           + ((DigestURI.probablyRootURL(t.urlhash())) 15 << this.ranking.coeff_urllength          : 0);

        //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

        return r; // the higher the number the better the ranking.
    }

}
TOP

Related Classes of de.anomic.search.ReferenceOrder$NormalizeWorker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.