/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.solrcell;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
import com.cloudera.cdk.morphline.solr.AbstractSolrMorphlineTest;
public class SolrCellMorphlineTest extends AbstractSolrMorphlineTest {
private Map<String,Integer> expectedRecords = new HashMap();
private Map<String, Map<String, Object>> expectedRecordContents = new HashMap();
@Before
public void setUp() throws Exception {
super.setUp();
String path = RESOURCES_DIR + "/test-documents";
expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
expectedRecords.put(path + "/cars.csv", 6);
expectedRecords.put(path + "/cars.csv.gz", 6);
expectedRecords.put(path + "/cars.tar.gz", 4);
expectedRecords.put(path + "/cars.tsv", 6);
expectedRecords.put(path + "/cars.ssv", 6);
expectedRecords.put(path + "/test-documents.7z", 9);
expectedRecords.put(path + "/test-documents.cpio", 9);
expectedRecords.put(path + "/test-documents.tar", 9);
expectedRecords.put(path + "/test-documents.tbz2", 9);
expectedRecords.put(path + "/test-documents.tgz", 9);
expectedRecords.put(path + "/test-documents.zip", 9);
expectedRecords.put(path + "/multiline-stacktrace.log", 4);
{
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "image/jpeg");
record.put("ignored_exif_isospeedratings", "400");
record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
record.put("ignored_tiff_model", "Canon EOS 40D");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
}
{
String file = path + "/testWORD_various.doc";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/msword");
record.put("ignored_author", "Michael McCandless");
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
record.put("ignored_title", "");
record.put("ignored_keywords", "Keyword1 Keyword2");
record.put("ignored_subject", "Subject is here");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/testPDF.pdf";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/pdf");
record.put("ignored_author", "Bertrand Delacrétaz");
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
record.put("ignored_title", "Apache Tika - Apache Tika");
record.put("ignored_xmp_creatortool", "Firefox");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/email.eml";
Map<String, Object> record = new LinkedHashMap();
String name = "Patrick Foo <foo@cloudera.com>";
record.put("ignored__attachment_mimetype", "message/rfc822");
record.put("ignored_author", name);
//record.put("ignored_content_length", "1068");
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_message_from", name);
record.put("ignored_message_to", name);
record.put("ignored_creator", name);
record.put("ignored_dc_creator", name);
record.put("ignored_dc_title", "Test EML");
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
record.put("ignored_meta_author", name);
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_subject", "Test EML");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/testEXCEL.xlsx";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
record.put("ignored_author", "Keith Bennett");
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
record.put("ignored_title", "Simple Excel document");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
}
@Test
public void testSolrCellJPGCompressed() throws Exception {
morphline = createMorphline("test-morphlines/solrCellJPGCompressed");
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testJPEG_EXIF.jpg",
path + "/testJPEG_EXIF.jpg.gz",
path + "/testJPEG_EXIF.jpg.tar.gz",
//path + "/jpeg2000.jp2",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
public void testSolrCellXML() throws Exception {
morphline = createMorphline("test-morphlines/solrCellXML");
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testXML2.xml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
public void testSolrCellDocumentTypes() throws Exception {
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testBMPfp.txt",
path + "/boilerplate.html",
path + "/NullHeader.docx",
path + "/testWORD_various.doc",
path + "/testPDF.pdf",
path + "/testJPEG_EXIF.jpg",
path + "/testJPEG_EXIF.jpg.gz",
path + "/testJPEG_EXIF.jpg.tar.gz",
path + "/testXML.xml",
path + "/cars.csv",
// path + "/cars.tsv",
// path + "/cars.ssv",
path + "/cars.csv.gz",
path + "/cars.tar.gz",
path + "/sample-statuses-20120906-141433.avro",
path + "/sample-statuses-20120906-141433",
path + "/sample-statuses-20120906-141433.gz",
path + "/sample-statuses-20120906-141433.bz2",
path + "/email.eml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
public void testSolrCellDocumentTypes2() throws Exception {
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] {
path + "/testPPT_various.ppt",
path + "/testPPT_various.pptx",
path + "/testEXCEL.xlsx",
path + "/testEXCEL.xls",
path + "/testPages.pages",
//path + "/testNumbers.numbers",
//path + "/testKeynote.key",
path + "/testRTFVarious.rtf",
path + "/complex.mbox",
path + "/test-outlook.msg",
path + "/testEMLX.emlx",
path + "/testRFC822",
path + "/rsstest.rss",
// path + "/testDITA.dita",
path + "/testMP3i18n.mp3",
path + "/testAIFF.aif",
path + "/testFLAC.flac",
// path + "/testFLAC.oga",
// path + "/testVORBIS.ogg",
path + "/testMP4.m4a",
path + "/testWAV.wav",
// path + "/testWMA.wma",
path + "/testFLV.flv",
// path + "/testWMV.wmv",
path + "/testBMP.bmp",
path + "/testPNG.png",
path + "/testPSD.psd",
path + "/testSVG.svg",
path + "/testTIFF.tif",
// path + "/test-documents.7z",
// path + "/test-documents.cpio",
// path + "/test-documents.tar",
// path + "/test-documents.tbz2",
// path + "/test-documents.tgz",
// path + "/test-documents.zip",
// path + "/test-zip-of-zip.zip",
// path + "/testJAR.jar",
// path + "/testKML.kml",
// path + "/testRDF.rdf",
path + "/testVISIO.vsd",
// path + "/testWAR.war",
// path + "/testWindows-x86-32.exe",
// path + "/testWINMAIL.dat",
// path + "/testWMF.wmf",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
/**
* Test that the ContentHandler properly strips the illegal characters
*/
@Test
public void testTransformValue() {
String fieldName = "user_name";
assertFalse("foobar".equals(getFoobarWithNonChars()));
Metadata metadata = new Metadata();
// load illegal char string into a metadata field and generate a new document,
// which will cause the ContentHandler to be invoked.
metadata.set(fieldName, getFoobarWithNonChars());
StripNonCharSolrContentHandlerFactory contentHandlerFactory =
new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
IndexSchema schema = h.getCore().getLatestSchema();
SolrContentHandler contentHandler =
contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
SolrInputDocument doc = contentHandler.newDocument();
String foobar = doc.getFieldValue(fieldName).toString();
assertTrue("foobar".equals(foobar));
}
/**
* Returns string "foobar" with illegal characters interspersed.
*/
private String getFoobarWithNonChars() {
char illegalChar = '\uffff';
StringBuilder builder = new StringBuilder();
builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
.append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
return builder.toString();
}
}