Package org.apache.flume.sink.solr.morphline

Source Code of org.apache.flume.sink.solr.morphline.MySolrLocator

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flume.sink.solr.morphline;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.flume.Channel;
import org.apache.flume.ChannelSelector;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.channel.MemoryChannel;
import org.apache.flume.channel.ReplicatingChannelSelector;
import org.apache.flume.conf.Configurables;
import org.apache.flume.event.EventBuilder;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.FaultTolerance;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.solr.DocumentLoader;
import org.kitesdk.morphline.solr.SolrLocator;
import org.kitesdk.morphline.solr.SolrMorphlineContext;
import org.kitesdk.morphline.solr.SolrServerDocumentLoader;
import org.kitesdk.morphline.solr.TestEmbeddedSolrServer;
import com.codahale.metrics.MetricRegistry;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.io.Files;

public class TestMorphlineSolrSink extends SolrTestCaseJ4 {

  private EmbeddedSource source;
  private SolrServer solrServer;
  private MorphlineSink sink;
  private Map<String,Integer> expectedRecords;

  private File tmpFile;
  private static final boolean TEST_WITH_EMBEDDED_SOLR_SERVER = true;
  private static final String EXTERNAL_SOLR_SERVER_URL = System.getProperty("externalSolrServer");
//private static final String EXTERNAL_SOLR_SERVER_URL = "http://127.0.0.1:8983/solr";
  private static final String RESOURCES_DIR = "target/test-classes";
//private static final String RESOURCES_DIR = "src/test/resources";
  private static final AtomicInteger SEQ_NUM = new AtomicInteger();
  private static final AtomicInteger SEQ_NUM2 = new AtomicInteger();
  private static final Logger LOGGER = LoggerFactory.getLogger(TestMorphlineSolrSink.class);

  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore(
        RESOURCES_DIR + "/solr/collection1/conf/solrconfig.xml",
        RESOURCES_DIR + "/solr/collection1/conf/schema.xml",
        RESOURCES_DIR + "/solr"
        );
  }

  @Before
  @Override
  public void setUp() throws Exception {
    super.setUp();
    String path = RESOURCES_DIR + "/test-documents";
    expectedRecords = new HashMap();
    expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
    expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
    expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
    expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
    expectedRecords.put(path + "/cars.csv", 5);
    expectedRecords.put(path + "/cars.csv.gz", 5);
    expectedRecords.put(path + "/cars.tar.gz", 4);
    expectedRecords.put(path + "/cars.tsv", 5);
    expectedRecords.put(path + "/cars.ssv", 5);

    final Map<String, String> context = new HashMap();
   
    if (EXTERNAL_SOLR_SERVER_URL != null) {
      throw new UnsupportedOperationException();
      //solrServer = new ConcurrentUpdateSolrServer(EXTERNAL_SOLR_SERVER_URL, 2, 2);
      //solrServer = new SafeConcurrentUpdateSolrServer(EXTERNAL_SOLR_SERVER_URL, 2, 2);
      //solrServer = new HttpSolrServer(EXTERNAL_SOLR_SERVER_URL);
    } else {
      if (TEST_WITH_EMBEDDED_SOLR_SERVER) {
        solrServer = new TestEmbeddedSolrServer(h.getCoreContainer(), "");
      } else {
        throw new RuntimeException("Not yet implemented");
        //solrServer = new TestSolrServer(getSolrServer());
      }
    }

    Map<String, String> channelContext = new HashMap();
    channelContext.put("capacity", "1000000");
    channelContext.put("keep-alive", "0"); // for faster tests
    Channel channel = new MemoryChannel();
    channel.setName(channel.getClass().getName() + SEQ_NUM.getAndIncrement());
    Configurables.configure(channel, new Context(channelContext));
    class MySolrSink extends MorphlineSolrSink {
      public MySolrSink(MorphlineHandlerImpl indexer) {
        super(indexer);
      }
    }
   
    int batchSize = SEQ_NUM2.incrementAndGet() % 2 == 0 ? 100 : 1;
    DocumentLoader testServer = new SolrServerDocumentLoader(solrServer, batchSize);
    MorphlineContext solrMorphlineContext = new SolrMorphlineContext.Builder()
      .setDocumentLoader(testServer)
      .setExceptionHandler(new FaultTolerance(false, false, SolrServerException.class.getName()))
      .setMetricRegistry(new MetricRegistry()).build();
   
    MorphlineHandlerImpl impl = new MorphlineHandlerImpl();
    impl.setMorphlineContext(solrMorphlineContext);
   
    class MySolrLocator extends SolrLocator { // trick to access protected ctor
      public MySolrLocator(MorphlineContext indexer) {
        super(indexer);
      }
    }

    SolrLocator locator = new MySolrLocator(solrMorphlineContext);
    locator.setSolrHomeDir(testSolrHome + "/collection1");
    String str1 = "SOLR_LOCATOR : " + locator.toString();
    //File solrLocatorFile = new File("target/test-classes/test-morphlines/solrLocator.conf");
    //String str1 = Files.toString(solrLocatorFile, Charsets.UTF_8);
    File morphlineFile = new File("target/test-classes/test-morphlines/solrCellDocumentTypes.conf");
    String str2 = Files.toString(morphlineFile, Charsets.UTF_8);
    tmpFile = File.createTempFile("morphline", ".conf");
    tmpFile.deleteOnExit();
    Files.write(str1 + "\n" + str2, tmpFile, Charsets.UTF_8);   
    context.put("morphlineFile", tmpFile.getPath());

    impl.configure(new Context(context));
    sink = new MySolrSink(impl);
    sink.setName(sink.getClass().getName() + SEQ_NUM.getAndIncrement());
    sink.configure(new Context(context));
    sink.setChannel(channel);
    sink.start();
   
    source = new EmbeddedSource(sink);   
    ChannelSelector rcs = new ReplicatingChannelSelector();
    rcs.setChannels(Collections.singletonList(channel));
    ChannelProcessor chp = new ChannelProcessor(rcs);
    Context chpContext = new Context();
    chpContext.put("interceptors", "uuidinterceptor");
    chpContext.put("interceptors.uuidinterceptor.type", UUIDInterceptor.Builder.class.getName());
    chp.configure(chpContext);
    source.setChannelProcessor(chp);
   
    deleteAllDocuments();
  }
 
  private void deleteAllDocuments() throws SolrServerException, IOException {
    SolrServer s = solrServer;
    s.deleteByQuery("*:*"); // delete everything!
    s.commit();
  }

  @After
  @Override
  public void tearDown() throws Exception {
    try {
      if (source != null) {
        source.stop();
        source = null;
      }
      if (sink != null) {
        sink.stop();
        sink = null;
      }
      if (tmpFile != null) {
        tmpFile.delete();
      }
    } finally {
      solrServer = null;
      expectedRecords = null;
      super.tearDown();
    }
  }

  @Test
  public void testDocumentTypes() throws Exception {
    String path = RESOURCES_DIR + "/test-documents";
    String[] files = new String[] {
        path + "/testBMPfp.txt",
        path + "/boilerplate.html",
        path + "/NullHeader.docx",
        path + "/testWORD_various.doc",         
        path + "/testPDF.pdf",
        path + "/testJPEG_EXIF.jpg",
        path + "/testXML.xml",         
//        path + "/cars.csv",
//        path + "/cars.tsv",
//        path + "/cars.ssv",
//        path + "/cars.csv.gz",
//        path + "/cars.tar.gz",
        path + "/sample-statuses-20120906-141433.avro",
        path + "/sample-statuses-20120906-141433",
        path + "/sample-statuses-20120906-141433.gz",
        path + "/sample-statuses-20120906-141433.bz2",
    };
    testDocumentTypesInternal(files);
  }

  @Test
  public void testDocumentTypes2() throws Exception {
    String path = RESOURCES_DIR + "/test-documents";
    String[] files = new String[] {
        path + "/testPPT_various.ppt",
        path + "/testPPT_various.pptx",       
        path + "/testEXCEL.xlsx",
        path + "/testEXCEL.xls",
        path + "/testPages.pages",
        path + "/testNumbers.numbers",
        path + "/testKeynote.key",
       
        path + "/testRTFVarious.rtf",
        path + "/complex.mbox",
        path + "/test-outlook.msg",
        path + "/testEMLX.emlx",
//        path + "/testRFC822", 
        path + "/rsstest.rss",
//        path + "/testDITA.dita",
       
        path + "/testMP3i18n.mp3",
        path + "/testAIFF.aif",
        path + "/testFLAC.flac",
//        path + "/testFLAC.oga",
//        path + "/testVORBIS.ogg", 
        path + "/testMP4.m4a",
        path + "/testWAV.wav",
//        path + "/testWMA.wma",
       
        path + "/testFLV.flv",
//        path + "/testWMV.wmv",
       
        path + "/testBMP.bmp",
        path + "/testPNG.png",
        path + "/testPSD.psd",       
        path + "/testSVG.svg"
        path + "/testTIFF.tif",    

//        path + "/test-documents.7z",
//        path + "/test-documents.cpio",
//        path + "/test-documents.tar",
//        path + "/test-documents.tbz2",
//        path + "/test-documents.tgz",
//        path + "/test-documents.zip",
//        path + "/test-zip-of-zip.zip",
//        path + "/testJAR.jar",
       
//        path + "/testKML.kml",
//        path + "/testRDF.rdf",
        path + "/testTrueType.ttf",
        path + "/testVISIO.vsd",
//        path + "/testWAR.war",
//        path + "/testWindows-x86-32.exe",
//        path + "/testWINMAIL.dat",
//        path + "/testWMF.wmf",
    };  
    testDocumentTypesInternal(files);
  }

  @Test
  public void testAvroRoundTrip() throws Exception {
    String file = RESOURCES_DIR + "/test-documents" + "/sample-statuses-20120906-141433.avro";
    testDocumentTypesInternal(file);
    QueryResponse rsp = query("*:*");
    Iterator<SolrDocument> iter = rsp.getResults().iterator();
    ListMultimap<String, String> expectedFieldValues;
    expectedFieldValues = ImmutableListMultimap.of("id", "1234567890", "text", "sample tweet one", "user_screen_name", "fake_user1");
    assertEquals(expectedFieldValues, next(iter));
    expectedFieldValues = ImmutableListMultimap.of("id", "2345678901", "text", "sample tweet two", "user_screen_name", "fake_user2")
    assertEquals(expectedFieldValues, next(iter));
    assertFalse(iter.hasNext());
  }
 
  private ListMultimap<String, Object> next(Iterator<SolrDocument> iter) {
    SolrDocument doc = iter.next();
    Record record = toRecord(doc);
    record.removeAll("_version_"); // the values of this field are unknown and internal to solr
    return record.getFields();   
  }
 
  private Record toRecord(SolrDocument doc) {
    Record record = new Record();
    for (String key : doc.keySet()) {
      record.getFields().replaceValues(key, doc.getFieldValues(key));       
    }
    return record;
  }
 
  private void testDocumentTypesInternal(String... files) throws Exception {
    int numDocs = 0;
    long startTime = System.currentTimeMillis();
   
    assertEquals(numDocs, queryResultSetSize("*:*"));     
//  assertQ(req("*:*"), "//*[@numFound='0']");
    for (int i = 0; i < 1; i++) {     
      for (String file : files) {
        File f = new File(file);
        byte[] body = Files.toByteArray(f);
        Event event = EventBuilder.withBody(body);
        event.getHeaders().put(Fields.ATTACHMENT_NAME, f.getName());
        load(event);
        Integer count = expectedRecords.get(file);
        if (count != null) {
          numDocs += count;
        } else {
          numDocs++;
        }
        assertEquals(numDocs, queryResultSetSize("*:*"));
      }
      LOGGER.trace("iter: {}", i);
    }
    LOGGER.trace("all done with put at {}", System.currentTimeMillis() - startTime);
    assertEquals(numDocs, queryResultSetSize("*:*"));
    LOGGER.trace("sink: ", sink);
  }

//  @Test
  public void benchmarkDocumentTypes() throws Exception {
    int iters = 200;
   
//    LogManager.getLogger(getClass().getPackage().getName()).setLevel(Level.INFO);
   
    assertEquals(0, queryResultSetSize("*:*"));     
    String path = RESOURCES_DIR + "/test-documents";
    String[] files = new String[] {
//        path + "/testBMPfp.txt",
//        path + "/boilerplate.html",
//        path + "/NullHeader.docx",
//        path + "/testWORD_various.doc",         
//        path + "/testPDF.pdf",
//        path + "/testJPEG_EXIF.jpg",
//        path + "/testXML.xml",         
//        path + "/cars.csv",
//        path + "/cars.csv.gz",
//        path + "/cars.tar.gz",
//        path + "/sample-statuses-20120906-141433.avro",
        path + "/sample-statuses-20120906-141433-medium.avro",
    };
   
    List<Event> events = new ArrayList();
    for (String file : files) {
      File f = new File(file);
      byte[] body = Files.toByteArray(f);
      Event event = EventBuilder.withBody(body);
//      event.getHeaders().put(Metadata.RESOURCE_NAME_KEY, f.getName());
      events.add(event);
    }
   
    long startTime = System.currentTimeMillis();
    for (int i = 0; i < iters; i++) {
      if (i % 10000 == 0) {
        LOGGER.info("iter: {}", i);
      }
      for (Event event : events) {
        event = EventBuilder.withBody(event.getBody(), new HashMap(event.getHeaders()));
        event.getHeaders().put("id", UUID.randomUUID().toString());
        load(event);
      }
    }
   
    float secs = (System.currentTimeMillis() - startTime) / 1000.0f;
    long numDocs = queryResultSetSize("*:*");
    LOGGER.info("Took secs: " + secs + ", iters/sec: " + (iters/secs));
    LOGGER.info("Took secs: " + secs + ", docs/sec: " + (numDocs/secs));
    LOGGER.info("Iterations: " + iters + ", numDocs: " + numDocs);
    LOGGER.info("sink: ", sink);
  }

  private void load(Event event) throws EventDeliveryException {
    source.load(event);
  }

  private void commit() throws SolrServerException, IOException {
    solrServer.commit(false, true, true);
  }
 
  private int queryResultSetSize(String query) throws SolrServerException, IOException {
    commit();
    QueryResponse rsp = query(query);
    LOGGER.debug("rsp: {}", rsp);
    int size = rsp.getResults().size();
    return size;
  }
 
  private QueryResponse query(String query) throws SolrServerException, IOException {
    commit();
    QueryResponse rsp = solrServer.query(new SolrQuery(query).setRows(Integer.MAX_VALUE));
    LOGGER.debug("rsp: {}", rsp);
    return rsp;
  }
 
}
TOP

Related Classes of org.apache.flume.sink.solr.morphline.MySolrLocator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.