Package org.apache.nutch.parse.mspowerpoint

Source Code of org.apache.nutch.parse.mspowerpoint.TestMSPowerPointParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse.mspowerpoint;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

/**
* <p>
* Unit tests for MSPowerPointParser.
* </p>
* <p>
* Make sure sample files are copied to "test.data" as specified in
* ./src/plugin/parse-mspowerpoint/build.xml during plugin compilation. Check
* ./src/plugin/parse-mspowerpoint/sample/README.txt for what they are.
* </p>
*
* @author Stephan Strittmatter - http://www.sybit.de
*
* @version 1.0
*/
public class TestMSPowerPointParser extends TestCase {
  private static final Log LOG = LogFactory.getLog(TestMSPowerPointParser.class);

  private static final String CHARSET = "UTF-8";

  private final static String LINE_SEPARATOR = System.getProperty("line.separator");

  /** This system property is defined in ./src/plugin/build-plugin.xml */
  private final static String SAMPLE_DIR = System.getProperty("test.data",
      "build/parse-mspowerpoint/test/data");

  private final File sampleDir = new File(SAMPLE_DIR);

  /**
   * Wether dumping the extracted data to file for visual checks.
   */
  private final static boolean DUMP_TO_FILE = false;

  private final File testFile;

  private String urlString;

  private Protocol protocol;

  private Content content;

  /**
   *
   * @param name
   */
  public TestMSPowerPointParser(String name) {
    super(name);
    this.testFile = new File(this.sampleDir, "test.ppt");
  }

  /**
   * @param file
   */
  public TestMSPowerPointParser(File file) {
    super();
    this.testFile = file;
  }

  /**
   * @see TestCase#setUp()
   */
  protected void setUp() throws Exception {
    super.setUp();

    this.urlString = createUrl(this.testFile.getName());

    System.out.println("Testing file: " + this.urlString + "...");
    this.protocol =new ProtocolFactory(NutchConfiguration.create()).getProtocol(this.urlString);
    this.content = this.protocol.getProtocolOutput(new Text(this.urlString), new CrawlDatum()).getContent();
  }

  /**
   * @see TestCase#tearDown()
   */
  protected void tearDown() throws Exception {
    super.tearDown();
  }

  /**
   * Testing all available ppt-docs stored in dir <code>SAMPLE_DIR</code> if
   * parsable without exceptions.
   *
   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testContent() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", this.content);

    ParseData data = parse.getData();
    String text = parse.getText();

    assertTrue("No content extracted length ==0", text.length() > 0);
   
    this.dumpToFile(this.testFile.getName(), data, text);

    final FileExtensionFilter contentFilter = new FileExtensionFilter(
        this.testFile.getName() + ".content");
    final File[] contentFiles = this.sampleDir.listFiles(contentFilter);

    if (contentFiles.length > 0) {
      String testContent = this.fileToString(contentFiles[0]);

      for (int i = 0; i < text.length(); i++) {
        char parsedChar = text.charAt(i);
        char testChar = testContent.charAt(i);
        assertEquals("Wrong char at position [" + i + "]", "" + testChar, ""
            + parsedChar);
      }
    } else {
      LOG.info("Comparison file for Content not available: "
          + this.testFile.getName() + ".content");
    }
  }

  /**
   * Testing all available ppt-docs stored in dir <code>SAMPLE_DIR</code> if
   * parsable without exceptions.
   *
   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testMeta() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", content);
   
    ParseData data = parse.getData();

    final FileExtensionFilter titleFilter = new FileExtensionFilter(
        this.testFile.getName() + ".meta");
    final File[] titleFiles = this.sampleDir.listFiles(titleFilter);

    if (titleFiles.length > 0) {
      assertEquals("Document Title", this.fileToString(titleFiles[0]),
          "Title: " + data.getTitle() + LINE_SEPARATOR +
          "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
    } else {
      assertTrue("Document Title length ==0", data.getTitle().length() > 0);
      LOG.info("Comparison file for Title not available: "
          + this.testFile.getName() + ".meta");
    }
  }

  /**
   * create complete url
   *
   * @param fileName
   *          name of the file
   * @return complete url.
   */
  private String createUrl(final String fileName) {
    return "file:" + SAMPLE_DIR + "/" + fileName;
  }

  /**
   * Dump the parsed data to a UTF-8 formatted file for visual checks.
   *
   * @param data
   * @param text
   * @param fileName
   * @throws IOException
   */
  private void dumpToFile(final String fileName, final ParseData data,
      final String text) throws IOException {
    if (TestMSPowerPointParser.DUMP_TO_FILE) {

      final File file = new File(fileName + ".txt");

      final FileOutputStream fos = new FileOutputStream(file);
      final OutputStreamWriter osw = new OutputStreamWriter(fos, CHARSET);

      osw.write(data.toString());
      osw.write(text);

      osw.close();
      fos.close();
    }
  }

  /**
   * Load the testfiles for comparison.
   *
   * @param file
   *          file to load
   * @return UNF-8 encoded String content of file.
   * @throws IOException
   */
  private String fileToString(final File file) throws IOException {
    FileInputStream fis = null;
    //InputStreamReader isr = null;
    BufferedReader br = null;
    final StringBuffer buf = new StringBuffer();

    try {
      fis = new FileInputStream(file);
      br = new BufferedReader(new InputStreamReader(fis, CHARSET));

      String line = br.readLine();
      while (line != null) {
        buf.append(line).append(LINE_SEPARATOR);
        line = br.readLine();
      }
    } finally {
      if (br != null) {
        br.close();
      }
      if (fis != null) {
        fis.close();
      }
    }

    String val = buf.toString();

    return val;
  }

}
TOP

Related Classes of org.apache.nutch.parse.mspowerpoint.TestMSPowerPointParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.