/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flume.sink.solr.morphline;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.flume.Channel;
import org.apache.flume.ChannelSelector;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.channel.MemoryChannel;
import org.apache.flume.channel.ReplicatingChannelSelector;
import org.apache.flume.conf.Configurables;
import org.apache.flume.event.EventBuilder;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.FaultTolerance;
import com.cloudera.cdk.morphline.base.Fields;
import com.cloudera.cdk.morphline.solr.DocumentLoader;
import com.cloudera.cdk.morphline.solr.SolrLocator;
import com.cloudera.cdk.morphline.solr.SolrMorphlineContext;
import com.cloudera.cdk.morphline.solr.SolrServerDocumentLoader;
import com.cloudera.cdk.morphline.solr.TestEmbeddedSolrServer;
import com.codahale.metrics.MetricRegistry;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Files;
public class TestMorphlineSolrSink extends SolrTestCaseJ4 {
private EmbeddedSource source;
private SolrServer solrServer;
private MorphlineSink sink;
private Map<String,Integer> expectedRecords;
private File tmpFile;
private static final boolean TEST_WITH_EMBEDDED_SOLR_SERVER = true;
private static final String EXTERNAL_SOLR_SERVER_URL = System.getProperty("externalSolrServer");
//private static final String EXTERNAL_SOLR_SERVER_URL = "http://127.0.0.1:8983/solr";
private static final String RESOURCES_DIR = "target/test-classes";
//private static final String RESOURCES_DIR = "src/test/resources";
private static final AtomicInteger SEQ_NUM = new AtomicInteger();
private static final AtomicInteger SEQ_NUM2 = new AtomicInteger();
private static final Logger LOGGER = LoggerFactory.getLogger(TestMorphlineSolrSink.class);
@BeforeClass
public static void beforeClass() throws Exception {
initCore(
RESOURCES_DIR + "/solr/collection1/conf/solrconfig.xml",
RESOURCES_DIR + "/solr/collection1/conf/schema.xml",
RESOURCES_DIR + "/solr"
);
}
@Before
@Override
public void setUp() throws Exception {
super.setUp();
String path = RESOURCES_DIR + "/test-documents";
expectedRecords = new HashMap();
expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
expectedRecords.put(path + "/cars.csv", 5);
expectedRecords.put(path + "/cars.csv.gz", 5);
expectedRecords.put(path + "/cars.tar.gz", 4);
expectedRecords.put(path + "/cars.tsv", 5);
expectedRecords.put(path + "/cars.ssv", 5);
final Map<String, String> context = new HashMap();
if (EXTERNAL_SOLR_SERVER_URL != null) {
throw new UnsupportedOperationException();
//solrServer = new ConcurrentUpdateSolrServer(EXTERNAL_SOLR_SERVER_URL, 2, 2);
//solrServer = new SafeConcurrentUpdateSolrServer(EXTERNAL_SOLR_SERVER_URL, 2, 2);
//solrServer = new HttpSolrServer(EXTERNAL_SOLR_SERVER_URL);
} else {
if (TEST_WITH_EMBEDDED_SOLR_SERVER) {
solrServer = new TestEmbeddedSolrServer(h.getCoreContainer(), "");
} else {
throw new RuntimeException("Not yet implemented");
//solrServer = new TestSolrServer(getSolrServer());
}
}
Map<String, String> channelContext = new HashMap();
channelContext.put("capacity", "1000000");
channelContext.put("keep-alive", "0"); // for faster tests
Channel channel = new MemoryChannel();
channel.setName(channel.getClass().getName() + SEQ_NUM.getAndIncrement());
Configurables.configure(channel, new Context(channelContext));
class MySolrSink extends MorphlineSolrSink {
public MySolrSink(MorphlineHandlerImpl indexer) {
super(indexer);
}
}
int batchSize = SEQ_NUM2.incrementAndGet() % 2 == 0 ? 100 : 1;
DocumentLoader testServer = new SolrServerDocumentLoader(solrServer, batchSize);
MorphlineContext solrMorphlineContext = new SolrMorphlineContext.Builder()
.setDocumentLoader(testServer)
.setExceptionHandler(new FaultTolerance(false, false, SolrServerException.class.getName()))
.setMetricRegistry(new MetricRegistry()).build();
MorphlineHandlerImpl impl = new MorphlineHandlerImpl();
impl.setMorphlineContext(solrMorphlineContext);
class MySolrLocator extends SolrLocator { // trick to access protected ctor
public MySolrLocator(MorphlineContext indexer) {
super(indexer);
}
}
SolrLocator locator = new MySolrLocator(solrMorphlineContext);
locator.setSolrHomeDir(testSolrHome + "/collection1");
String str1 = "SOLR_LOCATOR : " + locator.toString();
//File solrLocatorFile = new File("target/test-classes/test-morphlines/solrLocator.conf");
//String str1 = Files.toString(solrLocatorFile, Charsets.UTF_8);
File morphlineFile = new File("target/test-classes/test-morphlines/solrCellDocumentTypes.conf");
String str2 = Files.toString(morphlineFile, Charsets.UTF_8);
tmpFile = File.createTempFile("morphline", ".conf");
tmpFile.deleteOnExit();
Files.write(str1 + "\n" + str2, tmpFile, Charsets.UTF_8);
context.put("morphlineFile", tmpFile.getPath());
impl.configure(new Context(context));
sink = new MySolrSink(impl);
sink.setName(sink.getClass().getName() + SEQ_NUM.getAndIncrement());
sink.configure(new Context(context));
sink.setChannel(channel);
sink.start();
source = new EmbeddedSource(sink);
ChannelSelector rcs = new ReplicatingChannelSelector();
rcs.setChannels(Collections.singletonList(channel));
ChannelProcessor chp = new ChannelProcessor(rcs);
Context chpContext = new Context();
chpContext.put("interceptors", "uuidinterceptor");
chpContext.put("interceptors.uuidinterceptor.type", UUIDInterceptor.Builder.class.getName());
chp.configure(chpContext);
source.setChannelProcessor(chp);
deleteAllDocuments();
}
private void deleteAllDocuments() throws SolrServerException, IOException {
SolrServer s = solrServer;
s.deleteByQuery("*:*"); // delete everything!
s.commit();
}
@After
@Override
public void tearDown() throws Exception {
try {
if (source != null) {
source.stop();
source = null;
}
if (sink != null) {
sink.stop();
sink = null;
}
if (tmpFile != null) {
tmpFile.delete();
}
} finally {
solrServer = null;
expectedRecords = null;
super.tearDown();
}
}
@Test
public void testDocumentTypes() throws Exception {
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testBMPfp.txt",
path + "/boilerplate.html",
path + "/NullHeader.docx",
path + "/testWORD_various.doc",
path + "/testPDF.pdf",
path + "/testJPEG_EXIF.jpg",
path + "/testXML.xml",
// path + "/cars.csv",
// path + "/cars.tsv",
// path + "/cars.ssv",
// path + "/cars.csv.gz",
// path + "/cars.tar.gz",
path + "/sample-statuses-20120906-141433.avro",
path + "/sample-statuses-20120906-141433",
path + "/sample-statuses-20120906-141433.gz",
path + "/sample-statuses-20120906-141433.bz2",
};
testDocumentTypesInternal(files);
}
@Test
public void testDocumentTypes2() throws Exception {
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testPPT_various.ppt",
path + "/testPPT_various.pptx",
path + "/testEXCEL.xlsx",
path + "/testEXCEL.xls",
path + "/testPages.pages",
path + "/testNumbers.numbers",
path + "/testKeynote.key",
path + "/testRTFVarious.rtf",
path + "/complex.mbox",
path + "/test-outlook.msg",
path + "/testEMLX.emlx",
// path + "/testRFC822",
path + "/rsstest.rss",
// path + "/testDITA.dita",
path + "/testMP3i18n.mp3",
path + "/testAIFF.aif",
path + "/testFLAC.flac",
// path + "/testFLAC.oga",
// path + "/testVORBIS.ogg",
path + "/testMP4.m4a",
path + "/testWAV.wav",
// path + "/testWMA.wma",
path + "/testFLV.flv",
// path + "/testWMV.wmv",
path + "/testBMP.bmp",
path + "/testPNG.png",
path + "/testPSD.psd",
path + "/testSVG.svg",
path + "/testTIFF.tif",
// path + "/test-documents.7z",
// path + "/test-documents.cpio",
// path + "/test-documents.tar",
// path + "/test-documents.tbz2",
// path + "/test-documents.tgz",
// path + "/test-documents.zip",
// path + "/test-zip-of-zip.zip",
// path + "/testJAR.jar",
// path + "/testKML.kml",
// path + "/testRDF.rdf",
path + "/testTrueType.ttf",
path + "/testVISIO.vsd",
// path + "/testWAR.war",
// path + "/testWindows-x86-32.exe",
// path + "/testWINMAIL.dat",
// path + "/testWMF.wmf",
};
testDocumentTypesInternal(files);
}
@Test
public void testAvroRoundTrip() throws Exception {
String file = RESOURCES_DIR + "/test-documents" + "/sample-statuses-20120906-141433.avro";
testDocumentTypesInternal(file);
QueryResponse rsp = query("*:*");
Iterator<SolrDocument> iter = rsp.getResults().iterator();
ListMultimap<String, String> expectedFieldValues;
expectedFieldValues = ImmutableListMultimap.of("id", "1234567890", "text", "sample tweet one", "user_screen_name", "fake_user1");
assertEquals(expectedFieldValues, next(iter));
expectedFieldValues = ImmutableListMultimap.of("id", "2345678901", "text", "sample tweet two", "user_screen_name", "fake_user2");
assertEquals(expectedFieldValues, next(iter));
assertFalse(iter.hasNext());
}
private ListMultimap<String, Object> next(Iterator<SolrDocument> iter) {
SolrDocument doc = iter.next();
Record record = toRecord(doc);
record.removeAll("_version_"); // the values of this field are unknown and internal to solr
return record.getFields();
}
private Record toRecord(SolrDocument doc) {
Record record = new Record();
for (String key : doc.keySet()) {
record.getFields().replaceValues(key, doc.getFieldValues(key));
}
return record;
}
private void testDocumentTypesInternal(String... files) throws Exception {
int numDocs = 0;
long startTime = System.currentTimeMillis();
assertEquals(numDocs, queryResultSetSize("*:*"));
// assertQ(req("*:*"), "//*[@numFound='0']");
for (int i = 0; i < 1; i++) {
for (String file : files) {
File f = new File(file);
byte[] body = Files.toByteArray(f);
Event event = EventBuilder.withBody(body);
event.getHeaders().put(Fields.ATTACHMENT_NAME, f.getName());
load(event);
Integer count = expectedRecords.get(file);
if (count != null) {
numDocs += count;
} else {
numDocs++;
}
assertEquals(numDocs, queryResultSetSize("*:*"));
}
LOGGER.trace("iter: {}", i);
}
LOGGER.trace("all done with put at {}", System.currentTimeMillis() - startTime);
assertEquals(numDocs, queryResultSetSize("*:*"));
LOGGER.trace("sink: ", sink);
}
// @Test
public void benchmarkDocumentTypes() throws Exception {
int iters = 200;
// LogManager.getLogger(getClass().getPackage().getName()).setLevel(Level.INFO);
assertEquals(0, queryResultSetSize("*:*"));
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
// path + "/testBMPfp.txt",
// path + "/boilerplate.html",
// path + "/NullHeader.docx",
// path + "/testWORD_various.doc",
// path + "/testPDF.pdf",
// path + "/testJPEG_EXIF.jpg",
// path + "/testXML.xml",
// path + "/cars.csv",
// path + "/cars.csv.gz",
// path + "/cars.tar.gz",
// path + "/sample-statuses-20120906-141433.avro",
path + "/sample-statuses-20120906-141433-medium.avro",
};
List<Event> events = new ArrayList();
for (String file : files) {
File f = new File(file);
byte[] body = Files.toByteArray(f);
Event event = EventBuilder.withBody(body);
// event.getHeaders().put(Metadata.RESOURCE_NAME_KEY, f.getName());
events.add(event);
}
long startTime = System.currentTimeMillis();
for (int i = 0; i < iters; i++) {
if (i % 10000 == 0) {
LOGGER.info("iter: {}", i);
}
for (Event event : events) {
event = EventBuilder.withBody(event.getBody(), new HashMap(event.getHeaders()));
event.getHeaders().put("id", UUID.randomUUID().toString());
load(event);
}
}
float secs = (System.currentTimeMillis() - startTime) / 1000.0f;
long numDocs = queryResultSetSize("*:*");
LOGGER.info("Took secs: " + secs + ", iters/sec: " + (iters/secs));
LOGGER.info("Took secs: " + secs + ", docs/sec: " + (numDocs/secs));
LOGGER.info("Iterations: " + iters + ", numDocs: " + numDocs);
LOGGER.info("sink: ", sink);
}
private void load(Event event) throws EventDeliveryException {
source.load(event);
}
private void commit() throws SolrServerException, IOException {
solrServer.commit(false, true, true);
}
private int queryResultSetSize(String query) throws SolrServerException, IOException {
commit();
QueryResponse rsp = query(query);
LOGGER.debug("rsp: {}", rsp);
int size = rsp.getResults().size();
return size;
}
private QueryResponse query(String query) throws SolrServerException, IOException {
commit();
QueryResponse rsp = solrServer.query(new SolrQuery(query).setRows(Integer.MAX_VALUE));
LOGGER.debug("rsp: {}", rsp);
return rsp;
}
}