Package pignlproc.evaluation

Source Code of pignlproc.evaluation.SentencesWithLink

package pignlproc.evaluation;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.Span;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

/**
* Parse the text using a sentence tokenizer and for each link found emit a
* tuple with link info contextualized to the enclosing sentence.
*
* Paragraph location is used to ensure that the sentences do not span paragraph
* boundaries.
*
* Sentences are also given a sequential number that can be useful for
* reordering purposes.
*
* If a given sentence hold several link, the output bag will include as one
* sentence tuple for each link.
*/
public class SentencesWithLink extends EvalFunc<DataBag> {

    public static final String ENGLISH_SENTMODEL_PATH = "opennlp/en-sent.bin";

    TupleFactory tupleFactory = TupleFactory.getInstance();

    BagFactory bagFactory = BagFactory.getInstance();

    protected final SentenceModel model;

    public SentencesWithLink() throws IOException {
        ClassLoader loader = getClass().getClassLoader();
        // TODO: un-hardcode the model language
        String path = ENGLISH_SENTMODEL_PATH;
        InputStream in = loader.getResourceAsStream(path);
        if (in == null) {
            String message = String.format("Failed to find resource for model"
                    + " sentence detection model: %s", path);
            log.error(message);
            throw new IOException(message);
        }
        model = new SentenceModel(in);
    }

    @SuppressWarnings("unchecked")
    @Override
    public DataBag exec(Tuple input) throws IOException {
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);
        DataBag output = bagFactory.newDefaultBag();
        Object t0 = input.get(0);
        if (!(t0 instanceof String)) {
            throw new IOException("Expected input to be chararray, but got "
                    + t0.getClass().getName());
        }
        Object t1 = input.get(1);
        if (!(t1 instanceof DataBag)) {
            throw new IOException("Expected input to be bag of links, but got "
                    + t1.getClass().getName());
        }
        Object t2 = input.get(2);
        if (!(t1 instanceof DataBag)) {
            throw new IOException(
                    "Expected input to be bag of paragraphs, but got "
                            + t2.getClass().getName());
        }
        String text = (String) t0;
        DataBag links = (DataBag) t1;
        DataBag paragraphBag = (DataBag) t2;

        // convert the bag of links as absolute spans over the text
        List<Span> linkSpans = new ArrayList<Span>();
        for (Tuple l : links) {
            linkSpans.add(new Span((Integer) l.get(1), (Integer) l.get(2),
                    (String) l.get(0)));
        }
        Collections.sort(linkSpans);

        // iterate of the paragraph and extract sentence locations
        int order = 0;
        for (Tuple p : paragraphBag) {
            Integer beginParagraph = (Integer) p.get(1);
            Integer endParagraph = (Integer) p.get(2);
            Span[] spans = sentenceDetector.sentPosDetect(text.substring(
                    beginParagraph, endParagraph));
            for (Span sentenceRelative : spans) {
                // for each sentence found in that paragraph, compute the
                // absolute span of the text
                order++;
                Span absoluteSentence = new Span(beginParagraph
                        + sentenceRelative.getStart(), beginParagraph
                        + sentenceRelative.getEnd(), sentenceRelative.getType());

                String sentence = text.substring(absoluteSentence.getStart(),
                        absoluteSentence.getEnd());
                // replace some formatting white-spaces without changing the
                // number of chars not to break the annotations
                sentence = sentence.replaceAll("\n", " ");
                sentence = sentence.replaceAll("\t", " ");

                // for each link in that sentence, emit a tuple
                for (Span link : linkSpans) {
                    // TODO: optimize me by leveraging the link ordering
                    if (absoluteSentence.contains(link)) {
                        int begin = link.getStart()
                                - absoluteSentence.getStart();
                        int end = link.getEnd() - absoluteSentence.getStart();
                        output.add(tupleFactory.newTupleNoCopy(Arrays.asList(
                                order, sentence, link.getType(), begin, end)));
                    } else if (link.compareTo(absoluteSentence) > 1) {
                        break;
                    }
                }
            }
        }
        return output;

    }

    public Schema outputSchema(Schema input) {
        try {
            Schema tupleSchema = new Schema();
            tupleSchema.add(new FieldSchema("sentenceOrder", DataType.INTEGER));
            tupleSchema.add(new FieldSchema("sentence", DataType.CHARARRAY));
            tupleSchema.add(new FieldSchema("linkTarget", DataType.CHARARRAY));
            tupleSchema.add(new FieldSchema("linkBegin", DataType.INTEGER));
            tupleSchema.add(new FieldSchema("linkEnd", DataType.INTEGER));
            return new Schema(new Schema.FieldSchema(getSchemaName(
                    this.getClass().getName().toLowerCase(), input),
                    tupleSchema, DataType.BAG));
        } catch (Exception e) {
            return null;
        }
    }

}
TOP

Related Classes of pignlproc.evaluation.SentencesWithLink

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.