Package com.pinterest.secor.parser

Source Code of com.pinterest.secor.parser.PartitionFinalizer

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.pinterest.secor.parser;

import com.pinterest.secor.common.*;
import com.pinterest.secor.message.Message;
import com.pinterest.secor.util.CompressionUtil;
import com.pinterest.secor.util.FileUtil;

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Partition finalizer writes _SUCCESS files to date partitions that very likely won't be receiving
* any new messages. It also adds those partitions to Hive.
*
* @author Pawel Garbacki (pawel@pinterest.com)
*/
public class PartitionFinalizer {
    private static final Logger LOG = LoggerFactory.getLogger(PartitionFinalizer.class);

    private SecorConfig mConfig;
    private ZookeeperConnector mZookeeperConnector;
    private ThriftMessageParser mThriftMessageParser;
    private KafkaClient mKafkaClient;
    private QuboleClient mQuboleClient;
    private String mFileExtension;

    public PartitionFinalizer(SecorConfig config) throws Exception {
        mConfig = config;
        mKafkaClient = new KafkaClient(mConfig);
        mZookeeperConnector = new ZookeeperConnector(mConfig);
        mThriftMessageParser = new ThriftMessageParser(mConfig);
        mQuboleClient = new QuboleClient(mConfig);
        if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
            CompressionCodec codec = CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec());
            mFileExtension = codec.getDefaultExtension();
        } else {
            mFileExtension = "";
        }
    }

    private long getLastTimestampMillis(TopicPartition topicPartition) throws TException {
        Message message = mKafkaClient.getLastMessage(topicPartition);
        return mThriftMessageParser.extractTimestampMillis(message);
    }

    private long getLastTimestampMillis(String topic) throws TException {
        final int numPartitions = mKafkaClient.getNumPartitions(topic);
        long max_timestamp = Long.MIN_VALUE;
        for (int partition = 0; partition < numPartitions; ++partition) {
            TopicPartition topicPartition = new TopicPartition(topic, partition);
            long timestamp = getLastTimestampMillis(topicPartition);
            if (timestamp > max_timestamp) {
                max_timestamp = timestamp;
            }
        }
        if (max_timestamp == Long.MIN_VALUE) {
            return -1;
        }
        return max_timestamp;
    }

    private long getCommittedTimestampMillis(TopicPartition topicPartition) throws Exception {
        Message message = mKafkaClient.getCommittedMessage(topicPartition);
        if (message == null) {
            LOG.error("No message found for topic " + topicPartition.getTopic() + " partition " +
                    topicPartition.getPartition());
            return -1;
        }
        return mThriftMessageParser.extractTimestampMillis(message);
    }

    private long getCommittedTimestampMillis(String topic) throws Exception {
        final int numPartitions = mKafkaClient.getNumPartitions(topic);
        long minTimestamp = Long.MAX_VALUE;
        for (int partition = 0; partition < numPartitions; ++partition) {
            TopicPartition topicPartition = new TopicPartition(topic, partition);
            long timestamp = getCommittedTimestampMillis(topicPartition);
            if (timestamp == -1) {
                return -1;
            } else {
                if (timestamp < minTimestamp) {
                    minTimestamp = timestamp;
                }
            }
        }
        if (minTimestamp == Long.MAX_VALUE) {
            return -1;
        }
        return minTimestamp;
    }

    private NavigableSet<Calendar> getPartitions(String topic) throws IOException, ParseException {
        final String s3Prefix = "s3n://" + mConfig.getS3Bucket() + "/" + mConfig.getS3Path();
        String[] partitions = {"dt="};
        LogFilePath logFilePath = new LogFilePath(s3Prefix, topic, partitions,
            mConfig.getGeneration(), 0, 0, mFileExtension);
        String parentDir = logFilePath.getLogFileParentDir();
        String[] partitionDirs = FileUtil.list(parentDir);
        Pattern pattern = Pattern.compile(".*/dt=(\\d\\d\\d\\d-\\d\\d-\\d\\d)$");
        TreeSet<Calendar> result = new TreeSet<Calendar>();
        for (String partitionDir : partitionDirs) {
            Matcher matcher = pattern.matcher(partitionDir);
            if (matcher.find()) {
                String date = matcher.group(1);
                SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
                format.setTimeZone(TimeZone.getTimeZone("UTC"));
                Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
                calendar.setTime(format.parse(date));
                result.add(calendar);
            }
        }
        return result;
    }

    private void finalizePartitionsUpTo(String topic, Calendar calendar) throws IOException,
            ParseException, InterruptedException {
        NavigableSet<Calendar> partitionDates =
            getPartitions(topic).headSet(calendar, true).descendingSet();
        final String s3Prefix = "s3n://" + mConfig.getS3Bucket() + "/" + mConfig.getS3Path();
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        for (Calendar partition : partitionDates) {
            String partitionStr = format.format(partition.getTime());
            String[] partitions = {"dt=" + partitionStr};
            LogFilePath logFilePath = new LogFilePath(s3Prefix, topic, partitions,
                mConfig.getGeneration(), 0, 0, mFileExtension);
            String logFileDir = logFilePath.getLogFileDir();
            assert FileUtil.exists(logFileDir) : "FileUtil.exists(" + logFileDir + ")";
            String successFilePath = logFileDir + "/_SUCCESS";
            if (FileUtil.exists(successFilePath)) {
                return;
            }
            try {
                mQuboleClient.addPartition(topic, "dt='" + partitionStr + "'");
            } catch (Exception e) {
                LOG.error("failed to finalize topic " + topic + " partition dt=" + partitionStr,
                        e);
                continue;
            }
            LOG.info("touching file " + successFilePath);
            FileUtil.touch(successFilePath);
        }
    }

    /**
     * Get finalized timestamp for a given topic partition. Finalized timestamp is the current time
     * if the last offset for that topic partition has been committed earlier than an hour ago.
     * Otherwise, finalized timestamp is the committed timestamp.
     *
     * @param topicPartition The topic partition for which we want to compute the finalized
     *                       timestamp.
     * @return The finalized timestamp for the topic partition.
     * @throws Exception
     */
    private long getFinalizedTimestampMillis(TopicPartition topicPartition) throws Exception {
        long lastTimestamp = getLastTimestampMillis(topicPartition);
        long committedTimestamp = getCommittedTimestampMillis(topicPartition);
        long now = System.currentTimeMillis();
        if (lastTimestamp == committedTimestamp && (now - lastTimestamp) > 3600 * 1000) {
            return now;
        }
        return committedTimestamp;
    }

    private long getFinalizedTimestampMillis(String topic) throws Exception {
        final int numPartitions = mKafkaClient.getNumPartitions(topic);
        long minTimestamp = Long.MAX_VALUE;
        for (int partition = 0; partition < numPartitions; ++partition) {
            TopicPartition topicPartition = new TopicPartition(topic, partition);
            long timestamp = getFinalizedTimestampMillis(topicPartition);
            LOG.info("finalized timestamp for topic " + topic + " partition " + partition +
                    " is " + timestamp);
            if (timestamp == -1) {
                return -1;
            } else {
                if (timestamp < minTimestamp) {
                    minTimestamp = timestamp;
                }
            }
        }
        if (minTimestamp == Long.MAX_VALUE) {
            return -1;
        }
        return minTimestamp;
    }

    public void finalizePartitions() throws Exception {
        List<String> topics = mZookeeperConnector.getCommittedOffsetTopics();
        for (String topic : topics) {
            if (!topic.matches(mConfig.getKafkaTopicFilter())) {
                LOG.info("skipping topic " + topic);
            } else {
                LOG.info("finalizing topic " + topic);
                long finalizedTimestampMillis = getFinalizedTimestampMillis(topic);
                LOG.info("finalized timestamp for topic " + topic + " is " +
                        finalizedTimestampMillis);
                if (finalizedTimestampMillis != -1) {
                    Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
                    calendar.setTimeInMillis(finalizedTimestampMillis);
                    // Introduce a lag of one day and one hour.
                    calendar.add(Calendar.HOUR, -1);
                    calendar.add(Calendar.DAY_OF_MONTH, -1);
                    finalizePartitionsUpTo(topic, calendar);
                }
            }
        }
    }
}
TOP

Related Classes of com.pinterest.secor.parser.PartitionFinalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.