Package org.apache.nutch.parse

Source Code of org.apache.nutch.parse.TestParserFactory

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse;

// JUnit imports
import junit.framework.TestCase;

// Nutch imports
import org.apache.nutch.plugin.Extension;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

/**
* Unit test for new parse plugin selection.
*
* @author Sebastien Le Callonnec
* @version 1.0
*/
public class TestParserFactory extends TestCase {
 
  private Configuration conf;
  private ParserFactory parserFactory;
   
  public TestParserFactory(String name) { super(name); }

  /** Inits the Test Case with the test parse-plugin file */
  protected void setUp() throws Exception {
      conf = NutchConfiguration.create();
      conf.set("plugin.includes", ".*");
      conf.set("parse.plugin.file",
               "org/apache/nutch/parse/parse-plugin-test.xml");
      parserFactory = new ParserFactory(conf);
  }
   
  /** Unit test for <code>getExtensions(String)</code> method. */
  public void testGetExtensions() throws Exception {
    Extension ext = parserFactory.getExtensions("text/html").get(0);
    assertEquals("parse-tika", ext.getDescriptor().getPluginId());
    ext = parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0);
    assertEquals("parse-tika", ext.getDescriptor().getPluginId());
    ext = parserFactory.getExtensions("foo/bar").get(0);
    assertEquals("parse-tika", ext.getDescriptor().getPluginId());
  }
 
  /** Unit test to check <code>getParsers</code> method */
  public void testGetParsers() throws Exception {
    Parser [] parsers = parserFactory.getParsers("text/html", "http://foo.com");
    assertNotNull(parsers);
    assertEquals(1, parsers.length);
    assertEquals("org.apache.nutch.parse.tika.TikaParser",
                 parsers[0].getClass().getName());

    parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1",
                                       "http://foo.com");
    assertNotNull(parsers);
    assertEquals(1, parsers.length);
    assertEquals("org.apache.nutch.parse.tika.TikaParser",
                 parsers[0].getClass().getName());
   
    parsers = parserFactory.getParsers("application/x-javascript",
                                       "http://foo.com");
    assertNotNull(parsers);
    assertEquals(1, parsers.length);
    assertEquals("org.apache.nutch.parse.js.JSParseFilter",
                 parsers[0].getClass().getName());
   
    parsers = parserFactory.getParsers("text/plain", "http://foo.com");
    assertNotNull(parsers);
    assertEquals(1, parsers.length);
    assertEquals("org.apache.nutch.parse.tika.TikaParser",
                 parsers[0].getClass().getName());
   
    Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0];
    Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0];
  
    assertEquals("Different instances!", parser1.hashCode(), parser2.hashCode());
   
    //test and make sure that the rss parser is loaded even though its plugin.xml
    //doesn't claim to support text/rss, only application/rss+xml
    parsers = parserFactory.getParsers("text/rss","http://foo.com");
    assertNotNull(parsers);
    assertEquals(1,parsers.length);
    assertEquals("org.apache.nutch.parse.tika.TikaParser",
                 parsers[0].getClass().getName());
  }
}
TOP

Related Classes of org.apache.nutch.parse.TestParserFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.