Package com.bericotech.clavin.extractor

Source Code of com.bericotech.clavin.extractor.ApacheExtractor

package com.bericotech.clavin.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* ApacheExtractor.java
*
*###################################################################*/

/**
* Extracts location names from unstructured text documents using a
* named entity recognizer (Apache OpenNLP Name Finder).
*
*/
public class ApacheExtractor implements LocationExtractor {
   
    // the actual named entity recognizer (NER) object
    private NameFinderME nameFinder;
   
    // used to tokenize plain text into the OpenNLP format
    private TokenizerME tokenizer;

    // used to split the input into sentences before finding names
    private SentenceDetectorME sentenceDetector;
   
    // resource files used by Apache OpenNLP Name Finder
    private static final String pathToNERModel = "/en-ner-location.bin";
    private static final String pathToTokenizerModel = "/en-token.bin";
    private static final String pathToSentenceDetectorModel = "/en-sent.bin";

   
    /**
     * Builds an {@link ApacheExtractor} by instantiating the OpenNLP
     * Name Finder and Tokenizer.
     *
     * @throws IOException
     */
    public ApacheExtractor() throws IOException {
        nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
        tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
    }
   
    /**
     * Extracts location names from unstructured text using the named
     * entity recognizer (NER) feature provided by the Apache OpenNLP
     * Name Finder.
     *
     * @param plainText     Contents of text document
     * @return List of location names and positions
     */
    public List<LocationOccurrence> extractLocationNames(String plainText) {
        if(plainText == null) {
            throw new IllegalArgumentException("plaintext input to extractLocationNames should not be null");
        }

        List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();

        // The values used in these Spans are string character offsets
        Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText);

        // Each sentence gets processed on its own
        for (Span sentenceSpan : sentenceSpans) {

            // find the start and end position of this sentence in the document
            String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd());

            // tokenize the text into the required OpenNLP format
            String[] tokens = tokenizer.tokenize(sentence);

            //the values used in these Spans are string character offsets of each token from the sentence beginning
            Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);

            // find the location names in the tokenized text
            // the values used in these Spans are NOT string character offsets, they are indices into the 'tokens' array
            Span names[] = nameFinder.find(tokens);


            //for each name that got found, create our corresponding occurrence
            for (Span name : names) {

                //find offsets relative to the start of the sentence
                int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart();
                // -1 because the high end of a Span is noninclusive
                int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1].getEnd();

                //to get offsets relative to the document as a whole, just add the offset for the sentence itself
                int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord;
                int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord;

                //look back into the original input string to figure out what the text is that I got a hit on
                String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc);

                // add to List of results to return
                nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc));
            }

        }

        // this is necessary to maintain consistent results across
        // multiple runs on the same data, which is what we want
        nameFinder.clearAdaptiveData();

        return nerResults;
    }

}
TOP

Related Classes of com.bericotech.clavin.extractor.ApacheExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.