Package com.digitalpebble.behemoth.languageidentification

Source Code of com.digitalpebble.behemoth.languageidentification.LanguageIDProcessorTest

package com.digitalpebble.behemoth.languageidentification;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.nio.charset.Charset;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;

import com.digitalpebble.behemoth.BehemothConfiguration;
import com.digitalpebble.behemoth.BehemothDocument;
import com.digitalpebble.behemoth.languageidentification.LanguageIdProcessor;

public class LanguageIDProcessorTest extends TestCase {

    LanguageIdProcessor langid = null;

    public void setUp() throws Exception {
        Configuration conf = BehemothConfiguration.create();
        langid = new LanguageIdProcessor();
        langid.setConf(conf);
    }

    public void tearDown() throws Exception {

    }

    public void testLanguageID() {
        String text = "Ceci est un texte en Francais, un peu court mais ca devrait etre assez pour la detection.";
        assertEquals("fr", testLanguage(text));
        text = "This text is in English, it is a bit short but it should be enough for detecting its language";
        assertEquals("en", testLanguage(text));
    }

    private String testLanguage(String text) {
        // Create a very simple Behemoth document
        String url = "dummyDoc.html";
        BehemothDocument doc = new BehemothDocument();
        doc.setContent(text.getBytes());
        doc.setText(text);
        doc.setUrl(url);
        // don't set the text as such
        // or the content type
        BehemothDocument[] outputs = langid.process(doc, null);
        // the output should contain only one document
        assertEquals(1, outputs.length);
        BehemothDocument output = outputs[0];
        // the output document should have a language metadata
        // and its value should be french
        Writable lang = output.getMetadata().get(
                LanguageIdProcessor.languageMDKey);
        return lang.toString();
    }

}
TOP

Related Classes of com.digitalpebble.behemoth.languageidentification.LanguageIDProcessorTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.