/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.flume.source.twitter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.annotations.InterfaceAudience;
import org.apache.flume.annotations.InterfaceStability;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.source.AbstractSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import twitter4j.MediaEntity;
import twitter4j.StallWarning;
import twitter4j.Status;
import twitter4j.StatusDeletionNotice;
import twitter4j.StatusListener;
import twitter4j.TwitterStream;
import twitter4j.TwitterStreamFactory;
import twitter4j.User;
import twitter4j.auth.AccessToken;
/**
* Demo Flume source that connects via Streaming API to the 1% sample twitter
* firehose, continously downloads tweets, converts them to Avro format and
* sends Avro events to a downstream Flume sink.
*
* Requires the consumer and access tokens and secrets of a Twitter developer
* account
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class TwitterSource
extends AbstractSource
implements EventDrivenSource, Configurable, StatusListener {
private TwitterStream twitterStream;
private Schema avroSchema;
private long docCount = 0;
private long startTime = 0;
private long exceptionCount = 0;
private long totalTextIndexed = 0;
private long skippedDocs = 0;
private long batchEndTime = 0;
private final List<Record> docs = new ArrayList<Record>();
private final ByteArrayOutputStream serializationBuffer =
new ByteArrayOutputStream();
private DataFileWriter<GenericRecord> dataFileWriter;
private int maxBatchSize = 1000;
private int maxBatchDurationMillis = 1000;
// Fri May 14 02:52:55 +0000 2010
private SimpleDateFormat formatterTo =
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
private DecimalFormat numFormatter = new DecimalFormat("###,###.###");
private static int REPORT_INTERVAL = 100;
private static int STATS_INTERVAL = REPORT_INTERVAL * 10;
private static final Logger LOGGER =
LoggerFactory.getLogger(TwitterSource.class);
public TwitterSource() {
}
@Override
public void configure(Context context) {
String consumerKey = context.getString("consumerKey");
String consumerSecret = context.getString("consumerSecret");
String accessToken = context.getString("accessToken");
String accessTokenSecret = context.getString("accessTokenSecret");
LOGGER.info("Consumer Key: '" + consumerKey + "'");
LOGGER.info("Consumer Secret: '" + consumerSecret + "'");
LOGGER.info("Access Token: '" + accessToken + "'");
LOGGER.info("Access Token Secret: '" + accessTokenSecret + "'");
twitterStream = new TwitterStreamFactory().getInstance();
twitterStream.setOAuthConsumer(consumerKey, consumerSecret);
twitterStream.setOAuthAccessToken(new AccessToken(accessToken,
accessTokenSecret));
twitterStream.addListener(this);
avroSchema = createAvroSchema();
dataFileWriter = new DataFileWriter<GenericRecord>(
new GenericDatumWriter<GenericRecord>(avroSchema));
maxBatchSize = context.getInteger("maxBatchSize", maxBatchSize);
maxBatchDurationMillis = context.getInteger("maxBatchDurationMillis",
maxBatchDurationMillis);
}
@Override
public synchronized void start() {
LOGGER.info("Starting twitter source {} ...", this);
docCount = 0;
startTime = System.currentTimeMillis();
exceptionCount = 0;
totalTextIndexed = 0;
skippedDocs = 0;
batchEndTime = System.currentTimeMillis() + maxBatchDurationMillis;
twitterStream.sample();
LOGGER.info("Twitter source {} started.", getName());
// This should happen at the end of the start method, since this will
// change the lifecycle status of the component to tell the Flume
// framework that this component has started. Doing this any earlier
// tells the framework that the component started successfully, even
// if the method actually fails later.
super.start();
}
@Override
public synchronized void stop() {
LOGGER.info("Twitter source {} stopping...", getName());
twitterStream.shutdown();
super.stop();
LOGGER.info("Twitter source {} stopped.", getName());
}
public void onStatus(Status status) {
Record doc = extractRecord("", avroSchema, status);
if (doc == null) {
return; // skip
}
docs.add(doc);
if (docs.size() >= maxBatchSize ||
System.currentTimeMillis() >= batchEndTime) {
batchEndTime = System.currentTimeMillis() + maxBatchDurationMillis;
byte[] bytes;
try {
bytes = serializeToAvro(avroSchema, docs);
} catch (IOException e) {
LOGGER.error("Exception while serializing tweet", e);
return; //skip
}
Event event = EventBuilder.withBody(bytes);
getChannelProcessor().processEvent(event); // send event to the flume sink
docs.clear();
}
docCount++;
if ((docCount % REPORT_INTERVAL) == 0) {
LOGGER.info(String.format("Processed %s docs",
numFormatter.format(docCount)));
}
if ((docCount % STATS_INTERVAL) == 0) {
logStats();
}
}
private Schema createAvroSchema() {
Schema avroSchema = Schema.createRecord("Doc", "adoc", null, false);
List<Field> fields = new ArrayList<Field>();
fields.add(new Field("id", Schema.create(Type.STRING), null, null));
fields.add(new Field("user_friends_count",
createOptional(Schema.create(Type.INT)),
null, null));
fields.add(new Field("user_location",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("user_description",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("user_statuses_count",
createOptional(Schema.create(Type.INT)),
null, null));
fields.add(new Field("user_followers_count",
createOptional(Schema.create(Type.INT)),
null, null));
fields.add(new Field("user_name",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("user_screen_name",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("created_at",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("text",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("retweet_count",
createOptional(Schema.create(Type.LONG)),
null, null));
fields.add(new Field("retweeted",
createOptional(Schema.create(Type.BOOLEAN)),
null, null));
fields.add(new Field("in_reply_to_user_id",
createOptional(Schema.create(Type.LONG)),
null, null));
fields.add(new Field("source",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("in_reply_to_status_id",
createOptional(Schema.create(Type.LONG)),
null, null));
fields.add(new Field("media_url_https",
createOptional(Schema.create(Type.STRING)),
null, null));
fields.add(new Field("expanded_url",
createOptional(Schema.create(Type.STRING)),
null, null));
avroSchema.setFields(fields);
return avroSchema;
}
private Record extractRecord(String idPrefix, Schema avroSchema, Status status) {
User user = status.getUser();
Record doc = new Record(avroSchema);
doc.put("id", idPrefix + status.getId());
doc.put("created_at", formatterTo.format(status.getCreatedAt()));
doc.put("retweet_count", status.getRetweetCount());
doc.put("retweeted", status.isRetweet());
doc.put("in_reply_to_user_id", status.getInReplyToUserId());
doc.put("in_reply_to_status_id", status.getInReplyToStatusId());
addString(doc, "source", status.getSource());
addString(doc, "text", status.getText());
MediaEntity[] mediaEntities = status.getMediaEntities();
if (mediaEntities.length > 0) {
addString(doc, "media_url_https", mediaEntities[0].getMediaURLHttps());
addString(doc, "expanded_url", mediaEntities[0].getExpandedURL());
}
doc.put("user_friends_count", user.getFriendsCount());
doc.put("user_statuses_count", user.getStatusesCount());
doc.put("user_followers_count", user.getFollowersCount());
addString(doc, "user_location", user.getLocation());
addString(doc, "user_description", user.getDescription());
addString(doc, "user_screen_name", user.getScreenName());
addString(doc, "user_name", user.getName());
return doc;
}
private byte[] serializeToAvro(Schema avroSchema, List<Record> docList)
throws IOException {
serializationBuffer.reset();
dataFileWriter.create(avroSchema, serializationBuffer);
for (Record doc2 : docList) {
dataFileWriter.append(doc2);
}
dataFileWriter.close();
return serializationBuffer.toByteArray();
}
private Schema createOptional(Schema schema) {
return Schema.createUnion(Arrays.asList(
new Schema[] { schema, Schema.create(Type.NULL) }));
}
private void addString(Record doc, String avroField, String val) {
if (val == null) {
return;
}
doc.put(avroField, val);
totalTextIndexed += val.length();
}
private void logStats() {
double mbIndexed = totalTextIndexed / (1024 * 1024.0);
long seconds = (System.currentTimeMillis() - startTime) / 1000;
seconds = Math.max(seconds, 1);
LOGGER.info(String.format("Total docs indexed: %s, total skipped docs: %s",
numFormatter.format(docCount), numFormatter.format(skippedDocs)));
LOGGER.info(String.format(" %s docs/second",
numFormatter.format(docCount / seconds)));
LOGGER.info(String.format("Run took %s seconds and processed:",
numFormatter.format(seconds)));
LOGGER.info(String.format(" %s MB/sec sent to index",
numFormatter.format(((float) totalTextIndexed / (1024 * 1024)) / seconds)));
LOGGER.info(String.format(" %s MB text sent to index",
numFormatter.format(mbIndexed)));
LOGGER.info(String.format("There were %s exceptions ignored: ",
numFormatter.format(exceptionCount)));
}
public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
// Do nothing...
}
public void onScrubGeo(long userId, long upToStatusId) {
// Do nothing...
}
public void onStallWarning(StallWarning warning) {
// Do nothing...
}
public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
// Do nothing...
}
public void onException(Exception e) {
LOGGER.error("Exception while streaming tweets", e);
}
}