Package org.archive.crawler.frontier

Source Code of org.archive.crawler.frontier.AMQPUrlReceiver$StarterRestarter

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.frontier;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.framework.Frontier;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;

import com.rabbitmq.client.AMQP.BasicProperties;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
import com.rabbitmq.client.Consumer;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
import com.rabbitmq.client.ShutdownSignalException;

/**
* @contributor nlevitt
*/
public class AMQPUrlReceiver implements Lifecycle, ApplicationListener<CrawlStateEvent> {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger logger =
            Logger.getLogger(AMQPUrlReceiver.class.getName());

    public static final String A_RECEIVED_FROM_AMQP = "receivedFromAMQP";

    protected Frontier frontier;
    public Frontier getFrontier() {
        return this.frontier;
    }
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    protected String amqpUri = "amqp://guest:guest@localhost:5672/%2f";
    public String getAmqpUri() {
        return this.amqpUri;
    }
    public void setAmqpUri(String uri) {
        this.amqpUri = uri;
    }

    protected String exchange = "umbra";
    public String getExchange() {
        return exchange;
    }
    public void setExchange(String exchange) {
        this.exchange = exchange;
    }

    protected String queueName = "requests";
    public String getQueueName() {
        return queueName;
    }
    public void setQueueName(String queueName) {
        this.queueName = queueName;
    }

    protected boolean isRunning = false;
    @Override
    public boolean isRunning() {
        return isRunning;
    }
   
    private transient Lock lock = new ReentrantLock(true);

    private class StarterRestarter extends Thread {
        public StarterRestarter(String name) {
            super(name);
        }

        @Override
        public void run() {
            while (!Thread.interrupted()) {
                try {
                    lock.lockInterruptibly();
                    if (!isRunning) {
                        // start up again
                        try {
                            Consumer consumer = new UrlConsumer(channel());
                            channel().queueDeclare(getQueueName(), false, false, true, null);
                            channel().queueBind(getQueueName(), getExchange(), getQueueName());
                            channel().basicConsume(getQueueName(), false, consumer);
                            isRunning = true;
                            logger.info("started AMQP consumer uri=" + getAmqpUri() + " exchange=" + getExchange() + " queueName=" + getQueueName());
                        } catch (IOException e) {
                            logger.log(Level.SEVERE, "problem starting AMQP consumer (will try again after 30 seconds)", e);
                        }
                    }

                    Thread.sleep(30000);
                } catch (InterruptedException e) {
                    return;
                } finally {
                    lock.unlock();
                }
            }
        }
    }

    transient private StarterRestarter starterRestarter;
   
    @Override
    public void start() {
        lock.lock();
        try {
            // spawn off a thread to start up the amqp consumer, and try to restart it if it dies
            if (!isRunning) {
                starterRestarter = new StarterRestarter(AMQPUrlReceiver.class.getSimpleName() + "-starter-restarter");
                starterRestarter.start();
            }
        } finally {
            lock.unlock();
        }
    }

    @Override
    public void stop() {
        lock.lock();
        try {
            logger.info("shutting down");
            if (connection != null && connection.isOpen()) {
                try {
                    connection.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE, "problem closing AMQP connection", e);
                }
            }
            if (starterRestarter != null && starterRestarter.isAlive()) {
                starterRestarter.interrupt();
                try {
                    starterRestarter.join();
                } catch (InterruptedException e) {
                }
            }
            starterRestarter = null;
            connection = null;
            channel = null;
            isRunning = false;
        } finally {
            lock.unlock();
        }
    }

    transient protected Connection connection = null;
    transient protected Channel channel = null;

    protected Connection connection() throws IOException {
        lock.lock();
        try {
            if (connection != null && !connection.isOpen()) {
                logger.warning("connection is closed, creating a new one");
                connection = null;
            }

            if (connection == null) {
                ConnectionFactory factory = new ConnectionFactory();
                try {
                    factory.setUri(getAmqpUri());
                } catch (Exception e) {
                    throw new IOException("problem with AMQP uri " + getAmqpUri(), e);
                }
                connection = factory.newConnection();
            }

            return connection;
        } finally {
            lock.unlock();
        }
    }

    protected Channel channel() throws IOException {
        lock.lock();
        try {
            if (channel != null && !channel.isOpen()) {
                logger.warning("channel is not open, creating a new one");
                channel = null;
            }

            if (channel == null) {
                channel = connection().createChannel();
            }

            return channel;
        } finally {
            lock.unlock();
        }
    }

    // XXX should we be using QueueingConsumer because of possible blocking in
    // frontier.schedule()?
    // "Note: all methods of this interface are invoked inside the Connection's
    // thread. This means they a) should be non-blocking and generally do little
    // work, b) must not call Channel or Connection methods, or a deadlock will
    // ensue. One way of ensuring this is to use/subclass QueueingConsumer."
    protected class UrlConsumer extends DefaultConsumer {
        public UrlConsumer(Channel channel) {
            super(channel);
        }

        @Override
        public void handleDelivery(String consumerTag, Envelope envelope,
                BasicProperties properties, byte[] body) throws IOException {
            String decodedBody;
            try {
                decodedBody = new String(body, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e); // can't happen
            }
            JSONObject jo = new JSONObject(decodedBody);
           
            if ("GET".equals(jo.getString("method"))) {
                CrawlURI curi;
                try {
                    curi = makeCrawlUri(jo);
                    // bypasses scoping (unless rechecking is configured)
                    getFrontier().schedule(curi);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("scheduled " + curi);
                    }
                } catch (URIException e) {
                    logger.log(Level.WARNING,
                            "problem creating CrawlURI from json received via AMQP "
                                    + decodedBody, e);
                } catch (JSONException e) {
                    logger.log(Level.SEVERE,
                            "problem creating CrawlURI from json received via AMQP "
                                    + decodedBody, e);
                }
            } else {
                logger.warning("ignoring url with method other than GET - " + decodedBody);
            }

            this.getChannel().basicAck(envelope.getDeliveryTag(), false);
        }
       
        @Override
        public void handleShutdownSignal(String consumerTag,
                ShutdownSignalException sig) {
            if (!sig.isInitiatedByApplication()) {
                logger.log(Level.SEVERE, "amqp channel/connection unexpectedly shut down consumerTag=" + consumerTag, sig);
            } else {
                logger.info("amqp channel/connection shut down consumerTag=" + consumerTag);
            }
            isRunning = false;
        }

        // {
        //  "headers": {
        //   "Referer": "https://archive.org/",
        //   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36",
        //   "Accept": "image/webp,*/*;q=0.8"
        //  },
        //  "url": "https://analytics.archive.org/0.gif?server_ms=256&server_name=www19.us.archive.org&service=ao&loadtime=358&timediff=-8&locale=en-US&referrer=-&version=2&count=9",
        //  "method": "GET"
        // }
        @SuppressWarnings("unchecked")
        protected CrawlURI makeCrawlUri(JSONObject jo) throws URIException,
                JSONException {
            JSONObject joHeaders = jo.getJSONObject("headers");

            UURI uuri = UURIFactory.getInstance(jo.getString("url"));
            UURI via = UURIFactory.getInstance(jo.getString("parentUrl"));

            JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata");
            String parentHopPath = parentUrlMetadata.getString("pathFromSeed");
            String hopPath = parentHopPath + Hop.INFERRED.getHopString();

            CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC);
           
            // set the heritable data from the parent url, passed back to us via amqp
            // XXX brittle, only goes one level deep, and only handles strings and arrays, the latter of which it converts to a Set.
            // 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']}
            JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData");
            for (String key: (Set<String>) heritableData.keySet()) {
                Object value = heritableData.get(key);
                if (value instanceof JSONArray) {
                    Set<String> valueSet = new HashSet<String>();
                    JSONArray arr = ((JSONArray) value);
                    for (int i = 0; i < arr.length(); i++) {
                        valueSet.add(arr.getString(i));
                    }
                    curi.getData().put(key, valueSet);
                } else {
                    curi.getData().put(key, heritableData.get(key));
                }
            }

            // set the http headers from the amqp message
            Map<String, String> customHttpRequestHeaders = new HashMap<String, String>();
            for (Object key : joHeaders.keySet()) {
                customHttpRequestHeaders.put(key.toString(),
                        joHeaders.getString(key.toString()));
            }
            curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders);

            /* Use HighestUriQueuePrecedencePolicy to ensure these high priority
             * urls really get crawled ahead of others.
             * See https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
             */
            curi.setSchedulingDirective(SchedulingConstants.HIGH);
            curi.setPrecedence(1);
           
            //curi.setForceFetch(true);

            curi.getAnnotations().add(A_RECEIVED_FROM_AMQP);

            return curi;
        }
    }

    @Override
    public void onApplicationEvent(CrawlStateEvent event) {
        switch(event.getState()) {
        case PAUSING: case PAUSED:
            if (channel != null && channel.isOpen()) {
                try {
                    channel.flow(false);
                } catch (IOException e) {
                    logger.log(Level.WARNING, "failed to pause flow on amqp channel", e);
                }
            }
            break;

        case RUNNING: case EMPTY: case PREPARING:
            if (channel != null && channel.isOpen()) {
                try {
                    channel.flow(true);
                } catch (IOException e) {
                    logger.log(Level.SEVERE, "failed to resume flow on amqp channel", e);
                }
            }
            break;

        default:
        }
    }
}
TOP

Related Classes of org.archive.crawler.frontier.AMQPUrlReceiver$StarterRestarter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.