Package org.apache.tika.parser.odf

Source Code of org.apache.tika.parser.odf.ODFParserTest

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

public class ODFParserTest extends TikaTest {
    /**
     * For now, allow us to run some tests against both
     *  the old and the new parser
     */
    private Parser[] getParsers() {
       return new Parser[] {
             new OpenDocumentParser(),
             new OpenOfficeParser()
       };
    }

    @Test
    public void testOO3() throws Exception {
       for (Parser parser : getParsers()) {
          InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testODFwithOOo3.odt");
          try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
             parser.parse(input, handler, metadata, new ParseContext());

             assertEquals(
                   "application/vnd.oasis.opendocument.text",
                   metadata.get(Metadata.CONTENT_TYPE));

             String content = handler.toString();
             assertTrue(content.contains("Tika is part of the Lucene project."));
             assertTrue(content.contains("Solr"));
             assertTrue(content.contains("one embedded"));
             assertTrue(content.contains("Rectangle Title"));
             assertTrue(content.contains("a blue background and dark border"));       
          } finally {
             input.close();
          }
       }
    }

    @Test
    public void testOO2() throws Exception {
       for (Parser parser : getParsers()) {
          InputStream input = ODFParserTest.class.getResourceAsStream(
                 "/test-documents/testOpenOffice2.odt");
          try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
             parser.parse(input, handler, metadata, new ParseContext());

             assertEquals(
                   "application/vnd.oasis.opendocument.text",
                   metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
             assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
             assertEquals(
                   "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                   metadata.get("generator"));
            
             // Check date metadata, both old-style and new-style
             assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
             assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
             assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
             assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
             assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
            
             // Check the document statistics
             assertEquals("1", metadata.get(Office.PAGE_COUNT));
             assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
             assertEquals("14", metadata.get(Office.WORD_COUNT));
             assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
             assertEquals("0", metadata.get(Office.TABLE_COUNT));
             assertEquals("0", metadata.get(Office.OBJECT_COUNT));
             assertEquals("0", metadata.get(Office.IMAGE_COUNT));
            
             // Check the Tika-1.0 style document statistics
             assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
             assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
             assertEquals("14", metadata.get(Metadata.WORD_COUNT));
             assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
             assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
             assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
             assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
            
             // Check the very old style statistics (these will be removed shortly)
             assertEquals("0", metadata.get("nbTab"));
             assertEquals("0", metadata.get("nbObject"));
             assertEquals("0", metadata.get("nbImg"));
             assertEquals("1", metadata.get("nbPage"));
             assertEquals("1", metadata.get("nbPara"));
             assertEquals("14", metadata.get("nbWord"));
             assertEquals("78", metadata.get("nbCharacter"));

             // Custom metadata tags present but without values
             assertEquals(null, metadata.get("custom:Info 1"));
             assertEquals(null, metadata.get("custom:Info 2"));
             assertEquals(null, metadata.get("custom:Info 3"));
             assertEquals(null, metadata.get("custom:Info 4"));

             String content = handler.toString();
             assertTrue(content.contains(
                   "This is a sample Open Office document,"
                   + " written in NeoOffice 2.2.1 for the Mac."));
          } finally {
             input.close();
          }
       }
   }

   /**
    * Similar to {@link #testXMLParser()}, but using a different
    *  OO2 file with different metadata in it
    */
    @Test
   public void testOO2Metadata() throws Exception {
      InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testOpenOffice2.odf");
      try {
           Metadata metadata = new Metadata();
           ContentHandler handler = new BodyContentHandler();
           new OpenDocumentParser().parse(input, handler, metadata);
 
           assertEquals(
                   "application/vnd.oasis.opendocument.formula",
                   metadata.get(Metadata.CONTENT_TYPE));
           assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
           assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
           assertEquals("The quick brown fox jumps over the lazy dog",
                   metadata.get(TikaCoreProperties.TITLE));
           assertEquals("Gym class featuring a brown fox and lazy dog",
                   metadata.get(TikaCoreProperties.DESCRIPTION));
           assertEquals("Gym class featuring a brown fox and lazy dog",
                   metadata.get(OfficeOpenXMLCore.SUBJECT));
           assertEquals("Gym class featuring a brown fox and lazy dog",
                   metadata.get(Metadata.SUBJECT));
           assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
           assertEquals("1", metadata.get("editing-cycles"));
           assertEquals(
                   "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                   metadata.get("generator"));
           assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
          
           // User defined metadata
           assertEquals("Text 1", metadata.get("custom:Info 1"));
           assertEquals("2", metadata.get("custom:Info 2"));
           assertEquals("false", metadata.get("custom:Info 3"));
           assertEquals("true", metadata.get("custom:Info 4"));
          
           // No statistics present
           assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
           assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
           assertEquals(null, metadata.get(Metadata.WORD_COUNT));
           assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
           assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
           assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
           assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
           assertEquals(null, metadata.get("nbTab"));
           assertEquals(null, metadata.get("nbObject"));
           assertEquals(null, metadata.get("nbImg"));
           assertEquals(null, metadata.get("nbPage"));
           assertEquals(null, metadata.get("nbPara"));
           assertEquals(null, metadata.get("nbWord"));
           assertEquals(null, metadata.get("nbCharacter"));
 
           // Note - contents of maths files not currently supported
           String content = handler.toString();
           assertEquals("", content);
      } finally {
          input.close();
      }
   }

   /**
    * Similar to {@link #testXMLParser()}, but using an OO3 file
    */
    @Test
   public void testOO3Metadata() throws Exception {
      InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testODFwithOOo3.odt");
      try {
           Metadata metadata = new Metadata();
           ContentHandler handler = new BodyContentHandler();
           new OpenDocumentParser().parse(input, handler, metadata);
 
           assertEquals(
                   "application/vnd.oasis.opendocument.text",
                   metadata.get(Metadata.CONTENT_TYPE));
           assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
           assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
           assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
           assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
           assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
           assertEquals("Test document", metadata.get(Metadata.SUBJECT));
           assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
           assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
           assertEquals("Bart Hanssens", metadata.get("initial-creator"));
           assertEquals("2", metadata.get("editing-cycles"));
           assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
           assertEquals(
                   "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
                   metadata.get("generator"));
           assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
          
           // User defined metadata
           assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
           assertEquals(null, metadata.get("custom:Info 2"));
           assertEquals(null, metadata.get("custom:Info 3"));
           assertEquals(null, metadata.get("custom:Info 4"));
          
           // Check the document statistics
           assertEquals("2", metadata.get(Office.PAGE_COUNT));
           assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
           assertEquals("54", metadata.get(Office.WORD_COUNT));
           assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
           assertEquals("0", metadata.get(Office.TABLE_COUNT));
           assertEquals("2", metadata.get(Office.OBJECT_COUNT));
           assertEquals("0", metadata.get(Office.IMAGE_COUNT));
          
           // Check the Tika-1.0 style document statistics
           assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
           assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
           assertEquals("54", metadata.get(Metadata.WORD_COUNT));
           assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
           assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
           assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
           assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
          
           // Check the old style statistics (these will be removed shortly)
           assertEquals("0", metadata.get("nbTab"));
           assertEquals("2", metadata.get("nbObject"));
           assertEquals("0", metadata.get("nbImg"));
           assertEquals("2", metadata.get("nbPage"));
           assertEquals("13", metadata.get("nbPara"));
           assertEquals("54", metadata.get("nbWord"));
           assertEquals("351", metadata.get("nbCharacter"));
 
           String content = handler.toString();
           assertTrue(content.contains(
                 "Apache Tika Tika is part of the Lucene project."
           ));
      } finally {
          input.close();
      }
   }

    @Test
    public void testODPMasterFooter() throws Exception {
        InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testMasterFooter.odp");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);
 
            String content = handler.toString();
            assertContains("Master footer is here", content);
        } finally {
            input.close();
        }
    } 

    @Test
    public void testODTFooter() throws Exception {
        InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testFooter.odt");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);
 
            String content = handler.toString();
            assertContains("Here is some text...", content);
            assertContains("Here is some text on page 2", content);
            assertContains("Here is footer text", content);
        } finally {
            input.close();
        }
    } 

    @Test
    public void testODSFooter() throws Exception {
        InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testFooter.ods");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser().parse(input, handler, metadata);
 
            String content = handler.toString();
            assertContains("Here is a footer in the center area", content);
        } finally {
            input.close();
        }
    } 
}
TOP

Related Classes of org.apache.tika.parser.odf.ODFParserTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.