Package org.kitesdk.morphline.solrcell

Source Code of org.kitesdk.morphline.solrcell.SolrCellMorphlineTest

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.solrcell;

import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
import org.kitesdk.morphline.solr.AbstractSolrMorphlineTest;

public class SolrCellMorphlineTest extends AbstractSolrMorphlineTest {

  private Map<String,Integer> expectedRecords = new HashMap();
  private Map<String, Map<String, Object>> expectedRecordContents = new HashMap();

  @Before
  public void setUp() throws Exception {
    super.setUp();
   
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    expectedRecords.put(path + "sample-statuses-20120906-141433.avro", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.gz", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.bz2", 2);
    expectedRecords.put(path + "cars.csv", 6);
    expectedRecords.put(path + "cars.csv.gz", 6);
    expectedRecords.put(path + "cars.tar.gz", 4);
    expectedRecords.put(path + "cars.tsv", 6);
    expectedRecords.put(path + "cars.ssv", 6);
    expectedRecords.put(path + "test-documents.7z", 9);
    expectedRecords.put(path + "test-documents.cpio", 9);
    expectedRecords.put(path + "test-documents.tar", 9);
    expectedRecords.put(path + "test-documents.tbz2", 9);
    expectedRecords.put(path + "test-documents.tgz", 9);
    expectedRecords.put(path + "test-documents.zip", 9);
    expectedRecords.put(path + "multiline-stacktrace.log", 4);
   
    {
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "image/jpeg");
      record.put("ignored_exif_isospeedratings", "400");
      record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
      record.put("ignored_tiff_model", "Canon EOS 40D");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
    }
   
    {
      String file = path + "testWORD_various.doc";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/msword");
      record.put("ignored_author", "Michael McCandless");
      record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
      record.put("ignored_title", "");
      record.put("ignored_keywords", "Keyword1 Keyword2");
      record.put("ignored_subject", "Subject is here");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }
   
    {
      String file = path + "testPDF.pdf";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/pdf");
      record.put("ignored_author", "Bertrand Delacrétaz");
      record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
      record.put("ignored_title", "Apache Tika - Apache Tika");
      record.put("ignored_xmp_creatortool", "Firefox");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }
   
    {
      String file = path + "email.eml";
      Map<String, Object> record = new LinkedHashMap();
      String name = "Patrick Foo <foo@cloudera.com>";
      record.put("ignored__attachment_mimetype", "message/rfc822");
      record.put("ignored_author", name);
      //record.put("ignored_content_length", "1068");
      record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
      record.put("ignored_message_from", name);
      record.put("ignored_message_to", name);
      record.put("ignored_creator", name);
      record.put("ignored_dc_creator", name);
      record.put("ignored_dc_title", "Test EML");
      record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
      record.put("ignored_meta_author", name);
      record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
      record.put("ignored_subject", "Test EML");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }

    {
      String file = path + "testEXCEL.xlsx";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
      record.put("ignored_author", "Keith Bennett");
      record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
      record.put("ignored_title", "Simple Excel document");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }   
   
  }
 
  @Test
  public void testSolrCellJPGCompressed() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellJPGCompressed");   
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testJPEG_EXIF.jpg",
        path + "testJPEG_EXIF.jpg.gz",
        path + "testJPEG_EXIF.jpg.tar.gz",
        //path + "jpeg2000.jp2",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
 

  @Test
  public void testSolrCellXML() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellXML");   
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testXML2.xml",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
 

  @Test
  public void testSolrCellDocumentTypes() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellDocumentTypes");   
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testBMPfp.txt",
        path + "boilerplate.html",
        path + "NullHeader.docx",
        path + "testWORD_various.doc",         
        path + "testPDF.pdf",
        path + "testJPEG_EXIF.jpg",
        path + "testJPEG_EXIF.jpg.gz",
        path + "testJPEG_EXIF.jpg.tar.gz",
        path + "testXML.xml",         
        path + "cars.csv",
//        path + "cars.tsv",
//        path + "cars.ssv",
        path + "cars.csv.gz",
        path + "cars.tar.gz",
        path + "sample-statuses-20120906-141433.avro",
        path + "sample-statuses-20120906-141433",
        path + "sample-statuses-20120906-141433.gz",
        path + "sample-statuses-20120906-141433.bz2",
        path + "email.eml",
        path + "mime.eml",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }
 
  @Test
  public void testSolrCellDocumentTypes2() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellDocumentTypes");   
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testPPT_various.ppt",
        path + "testPPT_various.pptx",       
        path + "testEXCEL.xlsx",
        path + "testEXCEL.xls",
        path + "testPages.pages",
        //path + "testNumbers.numbers",
        //path + "testKeynote.key",
       
        path + "testRTFVarious.rtf",
        path + "complex.mbox",
        path + "test-outlook.msg",
        path + "testEMLX.emlx",
        path + "testRFC822"
        path + "rsstest.rss",
//        path + "testDITA.dita",
       
        path + "testMP3i18n.mp3",
        path + "testAIFF.aif",
        path + "testFLAC.flac",
//        path + "testFLAC.oga",
//        path + "testVORBIS.ogg", 
        path + "testMP4.m4a",
        path + "testWAV.wav",
//        path + "testWMA.wma",
       
        path + "testFLV.flv",
//        path + "testWMV.wmv",
       
        path + "testBMP.bmp",
        path + "testPNG.png",
        path + "testPSD.psd",       
        path + "testSVG.svg"
        path + "testTIFF.tif",    

//        path + "test-documents.7z",
//        path + "test-documents.cpio",
//        path + "test-documents.tar",
//        path + "test-documents.tbz2",
//        path + "test-documents.tgz",
//        path + "test-documents.zip",
//        path + "test-zip-of-zip.zip",
//        path + "testJAR.jar",
       
//        path + "testKML.kml",
//        path + "testRDF.rdf",
        path + "testVISIO.vsd",
//        path + "testWAR.war",
//        path + "testWindows-x86-32.exe",
//        path + "testWINMAIL.dat",
//        path + "testWMF.wmf",
    };  
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }

  /**
   * Test that the ContentHandler properly strips the illegal characters
   */
  @Test
  public void testTransformValue() {
    String fieldName = "user_name";
    assertFalse("foobar".equals(getFoobarWithNonChars()));

    Metadata metadata = new Metadata();
    // load illegal char string into a metadata field and generate a new document,
    // which will cause the ContentHandler to be invoked.
    metadata.set(fieldName, getFoobarWithNonChars());
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    SolrInputDocument doc = contentHandler.newDocument();
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }

  /**
   * Returns string "foobar" with illegal characters interspersed.
   */
  private String getFoobarWithNonChars() {
    char illegalChar = '\uffff';
    StringBuilder builder = new StringBuilder();
    builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
      .append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
    return builder.toString();
  }

}
TOP

Related Classes of org.kitesdk.morphline.solrcell.SolrCellMorphlineTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.