Package de.zib.scalaris.examples.wikipedia.data.xml

Source Code of de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpToScalarisHandler$MyScalarisSingleRunnable

/**
*  Copyright 2007-2011 Zuse Institute Berlin
*
*   Licensed under the Apache License, Version 2.0 (the "License");
*   you may not use this file except in compliance with the License.
*   You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*   limitations under the License.
*/
package de.zib.scalaris.examples.wikipedia.data.xml;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import de.zib.scalaris.AbortException;
import de.zib.scalaris.Connection;
import de.zib.scalaris.ConnectionException;
import de.zib.scalaris.ConnectionFactory;
import de.zib.scalaris.TimeoutException;
import de.zib.scalaris.Transaction;
import de.zib.scalaris.TransactionSingleOp;
import de.zib.scalaris.UnknownException;
import de.zib.scalaris.examples.wikipedia.ScalarisDataHandler;
import de.zib.scalaris.examples.wikipedia.ValueResult;
import de.zib.scalaris.examples.wikipedia.data.Page;
import de.zib.scalaris.examples.wikipedia.data.Revision;
import de.zib.scalaris.examples.wikipedia.data.ShortRevision;
import de.zib.scalaris.examples.wikipedia.data.SiteInfo;

/**
* Provides abilities to read an xml wiki dump file and write its contents to
* the Scalaris.
*
* @author Nico Kruber, kruber@zib.de
*/
public class WikiDumpToScalarisHandler extends WikiDumpPageHandler {
    private static final int MAX_SCALARIS_CONNECTIONS = Runtime.getRuntime().availableProcessors() * 2;
    private ArrayBlockingQueue<TransactionSingleOp> scalaris_single = new ArrayBlockingQueue<TransactionSingleOp>(MAX_SCALARIS_CONNECTIONS);
    private ArrayBlockingQueue<Transaction> scalaris_tx = new ArrayBlockingQueue<Transaction>(MAX_SCALARIS_CONNECTIONS);
    private ExecutorService executor = Executors.newFixedThreadPool(MAX_SCALARIS_CONNECTIONS);

    /**
     * Sets up a SAX XmlHandler exporting all parsed pages except the ones in a
     * blacklist to stdout.
     *
     * @param blacklist
     *            a number of page titles to ignore
     * @param whitelist
     *            only import these pages
     * @param maxRevisions
     *            maximum number of revisions per page (starting with the most
     *            recent) - <tt>-1/tt> imports all revisions
     *            (useful to speed up the import / reduce the DB size)
     * @param minTime
     *            minimum time a revision should have (only one revision older
     *            than this will be imported) - <tt>null/tt> imports all
     *            revisions
     * @param maxTime
     *            maximum time a revision should have (newer revisions are
     *            omitted) - <tt>null/tt> imports all revisions
     *            (useful to create dumps of a wiki at a specific point in time)
     *
     * @throws RuntimeException
     *             if the connection to Scalaris fails
     */
    public WikiDumpToScalarisHandler(Set<String> blacklist,
            Set<String> whitelist, int maxRevisions, Calendar minTime,
            Calendar maxTime) throws RuntimeException {
        super(blacklist, whitelist, maxRevisions, minTime, maxTime);
        init(ConnectionFactory.getInstance());
    }

    /**
     * Sets up a SAX XmlHandler exporting all parsed pages except the ones in a
     * blacklist to stdout.
     *
     * @param blacklist
     *            a number of page titles to ignore
     * @param whitelist
     *            only import these pages
     * @param maxRevisions
     *            maximum number of revisions per page (starting with the most
     *            recent) - <tt>-1/tt> imports all revisions
     *            (useful to speed up the import / reduce the DB size)
     * @param minTime
     *            minimum time a revision should have (only one revision older
     *            than this will be imported) - <tt>null/tt> imports all
     *            revisions
     * @param maxTime
     *            maximum time a revision should have (newer revisions are
     *            omitted) - <tt>null/tt> imports all revisions
     *            (useful to create dumps of a wiki at a specific point in time)
     * @param cFactory
     *            the connection factory to use for creating new connections
     *
     * @throws RuntimeException
     *             if the connection to Scalaris fails
     */
    public WikiDumpToScalarisHandler(Set<String> blacklist,
            Set<String> whitelist, int maxRevisions, Calendar maxTime,
            Calendar minTime, ConnectionFactory cFactory)
            throws RuntimeException {
        super(blacklist, whitelist, maxRevisions, minTime, maxTime);
        init(cFactory);
    }

    /**
     * Sets up connections to Scalaris.
     *
     * @param cFactory
     *            the connection factory to use for creating new connections
     *
     * @throws RuntimeException
     *             if the connection to Scalaris fails
     */
    private void init(ConnectionFactory cFactory) throws RuntimeException {
        try {
            for (int i = 0; i < MAX_SCALARIS_CONNECTIONS; ++i) {
                Connection connection = cFactory.createConnection(
                        "wiki_import", true);
                scalaris_single.put(new TransactionSingleOp(connection));
                connection = cFactory.createConnection(
                        "wiki_import", true);
                scalaris_tx.put(new Transaction(connection));
            }
        } catch (ConnectionException e) {
            System.err.println("Connection to Scalaris failed");
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            System.err.println("Interrupted while setting up multiple connections to Scalaris");
            throw new RuntimeException(e);
        }
    }

    /**
     * @param siteinfo
     * @throws RuntimeException
     */
    @Override
    protected void doExport(SiteInfo siteinfo) throws RuntimeException {
        String key = ScalarisDataHandler.getSiteInfoKey();
        TransactionSingleOp scalaris_single;
        try {
            scalaris_single = this.scalaris_single.take();
        } catch (InterruptedException e) {
            System.err.println("write of " + key + " interrupted while getting connection to Scalaris");
            throw new RuntimeException(e);
        }
        try {
            scalaris_single.write(key, siteinfo);
        } catch (ConnectionException e) {
            System.err.println("write of " + key + " failed with connection error");
        } catch (TimeoutException e) {
            System.err.println("write of " + key + " failed with timeout");
        } catch (AbortException e) {
            System.err.println("write of " + key + " failed with abort");
        } catch (UnknownException e) {
            System.err.println("write of " + key + " failed with unknown");
        }
        if (scalaris_single != null) {
            try {
                this.scalaris_single.put(scalaris_single);
            } catch (InterruptedException e) {
                System.err.println("Interrupted while putting back a connection to Scalaris");
                throw new RuntimeException(e);
            }
        }
    }
   
    /**
     * @param page
     * @param revisions
     * @param revisions_short
     * @throws UnsupportedOperationException
     */
    @Override
    protected void doExport(Page page, List<Revision> revisions,
            List<ShortRevision> revisions_short)
            throws UnsupportedOperationException {
        // do not make the translog too full -> write revisions beforehand,
        // ignore the (rest of the) page if a failure occured
        TransactionSingleOp.RequestList requests = new TransactionSingleOp.RequestList();
        for (Revision rev : revisions) {
            String key = ScalarisDataHandler.getRevKey(page.getTitle(), rev.getId(), wikiModel.getNamespace());
            requests.addWrite(key, rev);
        }
        requests.addWrite(ScalarisDataHandler.getRevListKey(page.getTitle(), wikiModel.getNamespace()), revisions_short);
        requests.addWrite(ScalarisDataHandler.getPageKey(page.getTitle(), wikiModel.getNamespace()), page);
        Runnable worker = new MyScalarisSingleRunnable(requests, scalaris_single, "revisions and page of " + page.getTitle());
        executor.execute(worker);
        newPages.add(wikiModel.normalisePageTitle(page.getTitle()));
        // simple article filter: only pages in main namespace:
        if (wikiModel.getNamespace(page.getTitle()).isEmpty()) {
            newArticles.add(wikiModel.normalisePageTitle(page.getTitle()));
        }
        // only export page list every UPDATE_PAGELIST_EVERY pages:
        if ((newPages.size() % UPDATE_PAGELIST_EVERY) == 0) {
            updatePageLists();
        }
    }

    /* (non-Javadoc)
     * @see de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpHandler#tearDown()
     */
    @Override
    public void tearDown() {
        super.tearDown();
        updatePageLists();
        importEnd();
    }
   
    private void updatePageLists() {
        String scalaris_key;
        Runnable worker;
       
        // list of pages:
        scalaris_key = ScalarisDataHandler.getPageListKey();
        worker = new MyScalarisAddToPageListRunnable(scalaris_key, newPages, scalaris_tx, ScalarisDataHandler.getPageCountKey());
        executor.execute(worker);
        newPages = new ArrayList<String>(UPDATE_PAGELIST_EVERY);
       
        // list of articles:
        scalaris_key = ScalarisDataHandler.getArticleListKey();
        worker = new MyScalarisAddToPageListRunnable(scalaris_key, newArticles, scalaris_tx, ScalarisDataHandler.getArticleCountKey());
        executor.execute(worker);
        newArticles = new ArrayList<String>(UPDATE_PAGELIST_EVERY);
       
        // list of pages in each category:
        for (Entry<String, List<String>> category: newCategories.entrySet()) {
            final String catName = category.getKey();
            scalaris_key = ScalarisDataHandler.getCatPageListKey(catName, wikiModel.getNamespace());
            worker = new MyScalarisAddToPageListRunnable(scalaris_key, category.getValue(), scalaris_tx, ScalarisDataHandler.getCatPageCountKey(catName, wikiModel.getNamespace()));
            executor.execute(worker);
        }
        newCategories = new HashMap<String, List<String>>(NEW_CATS_HASH_DEF_SIZE);

        // list of pages a template is used in:
        for (Entry<String, List<String>> template: newTemplates.entrySet()) {
            scalaris_key = ScalarisDataHandler.getTplPageListKey(template.getKey(), wikiModel.getNamespace());
            worker = new MyScalarisAddToPageListRunnable(scalaris_key, template.getValue(), scalaris_tx);
            executor.execute(worker);
        }
        newTemplates = new HashMap<String, List<String>>(NEW_TPLS_HASH_DEF_SIZE);
       
        // list of pages linking to other pages:
        for (Entry<String, List<String>> backlinks: newBackLinks.entrySet()) {
            scalaris_key = ScalarisDataHandler.getBackLinksPageListKey(backlinks.getKey(), wikiModel.getNamespace());
            worker = new MyScalarisAddToPageListRunnable(scalaris_key, backlinks.getValue(), scalaris_tx);
            executor.execute(worker);
        }
        newBackLinks = new HashMap<String, List<String>>(NEW_BLNKS_HASH_DEF_SIZE);
       
        executor.shutdown();
        boolean shutdown = false;
        while (!shutdown) {
            try {
                shutdown = executor.awaitTermination(1, TimeUnit.MINUTES);
            } catch (InterruptedException e) {
            }
        }
        executor = Executors.newFixedThreadPool(MAX_SCALARIS_CONNECTIONS);
    }

    /**
     * Processes write requests to Scalaris in a separate thread. Takes one of
     * the available {@link #scalaris_single} connections.
     *
     * @author Nico Kruber, kruber@zib.de
     */
    static class MyScalarisSingleRunnable implements Runnable {
        private TransactionSingleOp.RequestList requests;
        private ArrayBlockingQueue<TransactionSingleOp> scalaris_single;
        private String note;
       
        public MyScalarisSingleRunnable(TransactionSingleOp.RequestList requests, ArrayBlockingQueue<TransactionSingleOp> scalaris_single, String note) {
            this.requests = requests;
            this.scalaris_single = scalaris_single;
            this.note = note;
        }
       
        @Override
        public void run() {
            TransactionSingleOp scalaris_single;
            try {
                scalaris_single = this.scalaris_single.take();
            } catch (InterruptedException e) {
                System.err.println("write of " + note + " interrupted while getting connection to Scalaris");
                throw new RuntimeException(e);
            }
            try {
                TransactionSingleOp.ResultList results = scalaris_single.req_list(requests);
                for (int i = 0; i < results.size(); ++i) {
                    results.processWriteAt(i);
                }
            } catch (ConnectionException e) {
                System.err.println("write of " + note + " failed with connection error");
            } catch (TimeoutException e) {
                System.err.println("write of " + note + " failed with timeout");
            } catch (AbortException e) {
                System.err.println("write of " + note + " failed with abort");
            }
            if (scalaris_single != null) {
                try {
                    this.scalaris_single.put(scalaris_single);
                } catch (InterruptedException e) {
                    System.err.println("Interrupted while putting back a connection to Scalaris");
                    throw new RuntimeException(e);
                }
            }
        }
    }

    /**
     * Processes page list update requests to Scalaris in a separate thread.
     * Takes one of the available {@link #scalaris_tx} connections.
     *
     * @author Nico Kruber, kruber@zib.de
     */
    private static class MyScalarisAddToPageListRunnable implements Runnable {
        private String scalaris_pageList_key;
        private List<String> newEntries;
        private ArrayBlockingQueue<Transaction> scalaris_tx;
        private String scalaris_pageCount_key = null;
       
        public MyScalarisAddToPageListRunnable(String scalaris_key, List<String> newEntries, ArrayBlockingQueue<Transaction> scalaris_tx) {
            this.scalaris_pageList_key = scalaris_key;
            this.newEntries = newEntries;
            this.scalaris_tx = scalaris_tx;
        }
       
        public MyScalarisAddToPageListRunnable(String scalaris_pageList_key, List<String> newEntries, ArrayBlockingQueue<Transaction> scalaris_tx, String scalaris_pageCount_key) {
            this.scalaris_pageList_key = scalaris_pageList_key;
            this.newEntries = newEntries;
            this.scalaris_tx = scalaris_tx;
            this.scalaris_pageCount_key = scalaris_pageCount_key;
        }
       
        @Override
        public void run() {
            Transaction scalaris_tx;
            try {
                scalaris_tx = this.scalaris_tx.take();
            } catch (InterruptedException e) {
                System.err.println("update of " + scalaris_pageList_key + " and " + scalaris_pageCount_key + " interrupted while getting connection to Scalaris");
                throw new RuntimeException(e);
            }
           
            ValueResult<Integer> result = ScalarisDataHandler.updatePageList(
                    scalaris_tx, scalaris_pageList_key, scalaris_pageCount_key,
                    new LinkedList<String>(), newEntries, "");
            if (!result.success) {
                System.err.println(result.message);
            }
           
            if (scalaris_tx != null) {
                try {
                    this.scalaris_tx.put(scalaris_tx);
                } catch (InterruptedException e) {
                    System.err.println("update of " + scalaris_pageList_key + " and " + scalaris_pageCount_key + " interrupted while putting back a connection to Scalaris");
                    throw new RuntimeException(e);
                }
            }
        }
    }
}
TOP

Related Classes of de.zib.scalaris.examples.wikipedia.data.xml.WikiDumpToScalarisHandler$MyScalarisSingleRunnable

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.