Source Code of org.dbpedia.spotlight.io.DatasetSplitter$BySurfaceForm

/**
 * Copyright 2011 Pablo Mendes, Max Jakob
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.dbpedia.spotlight.io;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.store.FSDirectory;
import org.dbpedia.spotlight.exceptions.SearchException;
import org.dbpedia.spotlight.lucene.LuceneManager;
import org.dbpedia.spotlight.lucene.search.LuceneCandidateSearcher;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;
import org.semanticweb.yars.nx.parser.ParseException;


import java.io.*;
import java.util.*;


/**
 * Created by IntelliJ IDEA.
 * User: PabloMendes
 * Date: Jul 23, 2010
 * Time: 3:53:58 PM
 * To change this template use File | Settings | File Templates.
 */
public abstract class DatasetSplitter {


    Log LOG = LogFactory.getLog(this.getClass());
    int incrementalId = 0;


    Writer mTrainingSetWriter;
    Writer mTestSetWriter;


    /**
     * Abstract constructor. Please see @link{BySize} and @link{BySurfaceForm}}
     * @param trainingSetFile
     * @param testSetFile
     * @throws IOException
     */
    public DatasetSplitter(File trainingSetFile, File testSetFile) throws IOException {
        this.mTrainingSetWriter = new BufferedWriter(new FileWriter(trainingSetFile));
        this.mTestSetWriter = new BufferedWriter(new FileWriter(testSetFile));
    }


    public abstract boolean shouldKeepTheseOccurrences(List<String> items);


    public abstract void split(List<String> items) throws IOException;


    //TODO Max: question: does this assume sorting by URI?
    public void run(InputStream stream) throws IOException {
        String currentItem = "";
        List<String> items = new ArrayList<String>();
        Scanner scanner = new Scanner(new InputStreamReader(stream, "UTF-8"));
        int nItemsKept = 0;
        while (scanner.hasNext()) {
            String line = scanner.nextLine();
            incrementalId++;


            if (line==null || line.trim().equals(""))
                continue;


            String[] fields = line.split("\t");
            String uri;
            if (fields.length >= 5) {
                uri = fields[0];
            }
            else {
                uri = fields[1];
            }
//                    String surfaceForm = fields[1];
//                    String context = fields[2];
//                    String offset = fields[3];
//                    String type = fields[4];


            //Tuple5<String,String,String,String,String> t = new Tuple5<String,String,String,String,String>(surfaceForm, uri, context, offset, type);


            if ( !uri.equals(currentItem)){


                if (shouldKeepTheseOccurrences(items)) {
                    nItemsKept++;
                    LOG.trace("End of current item: "+currentItem+" / size: "+items.size()+" - saving!");
                    split(items);
                } // else ignore
                //reset current item
                currentItem = uri;
                items = new ArrayList<String>();
            }
            items.add(line.toString());


            if (incrementalId % 50000 == 0)
                LOG.info("Processed "+incrementalId+" occurrences. Kept occurrences for "+nItemsKept+" URIs.");
        }
        scanner.close();
        LOG.info("Processed "+incrementalId+" occurrences. Kept occurrences for "+nItemsKept+" URIs");
    }


//    public void run(File f) {
//        LOG.info("Loading occurrences from "+f.getPath());
//        String currentItem = "";
//        //Set<Tuple5> items = new HashSet<Tuple5>();
//        List<String> items = new ArrayList<String>();
//
//        if (f.getName().length() != 0) {
//            try {
//                FastBufferedReader in = new FastBufferedReader(new FileReader(f));
//                MutableString line = new MutableString();
//                int i = 0;
//                while ((line = in.readLine(line)) != null) {
//                    incrementalId++;
//
//                    if (line==null || line.trim().equals(""))
//                        continue;
//
//                    String[] fields = line.toString().split("\t");
////                    String surfaceForm = fields[0];
//                    String uri = fields[1];
////                    String context = fields[2];
////                    String offset = fields[3];
////                    String type = fields[4];
//
//                    //Tuple5<String,String,String,String,String> t = new Tuple5<String,String,String,String,String>(surfaceForm, uri, context, offset, type);
//
//                    if ( !uri.equals(currentItem)){
//                        if (i >= mMinNumberOfExamples) {
//                            uniformSplit(items);
//                        } // else ignore
//                        //reset current item
//                        currentItem = uri;
//                        items = new ArrayList<String>();
//                    }
//                    items.add(line.toString());
//                    i++;
//                }
//                in.close();
//
//            } catch (IOException e) {e.printStackTrace();}
//        }
//        LOG.info("Done. Loaded "+items.size()+" items.");
//
//    }






    public void write(int id, String item, Writer writer) throws IOException {
        StringBuffer sb = new StringBuffer();
        sb.append(id);
        sb.append("\t");
        sb.append(item);
        sb.append("\n");
        writer.write(sb.toString());
    }


    public static class BySize extends DatasetSplitter {


        int mMinNumberOfExamples = 1;
        double mPercentSplit = 0.5;


        public BySize(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit) throws IOException {
            super(trainingSetFile, testSetFile);
            this.mMinNumberOfExamples = minNumberOfExamples;
            this.mPercentSplit = percentSplit;
        }


        @Override
        public boolean shouldKeepTheseOccurrences(List<String> items) {
            return items.size() >= mMinNumberOfExamples;
        }


        @Override
        public void split(List<String> items) throws IOException {
            int i = incrementalId-items.size(); // set
            int n = (new Double(items.size() * mPercentSplit)).intValue();
            for (String item: items) {
                if((n>0) && // When there are enough items for dividing in training and testing
                        (i % (items.size() / n) == 0)){  // For a 10% split, uniformSplit every 10th entry
                    LOG.trace("Writing to test: "+i+" "+items.size()+"/"+ n );
                    write(i, item, mTestSetWriter);
                } else {
                    // For a 10% split, it will write to training 90% of the times, plus
                    // when there are not enough examples to split between training and testing
                    // That should assure that all senses are in training to be picked.
                    LOG.trace("Writing to training: "+i);
                    write(i, item, mTrainingSetWriter);
                }


                i++;
            }
        }


    }


    public static class BySurfaceForm extends BySize {


        Set<String> mValidSurfaceForms = new HashSet<String>();


        public BySurfaceForm(File trainingSetFile, File testSetFile, int minNumberOfExamples, double percentSplit, Set<String> validSurfaceForms) throws IOException {
            super(trainingSetFile, testSetFile, minNumberOfExamples, percentSplit);
            mValidSurfaceForms = validSurfaceForms;
            LOG.info("Assuming "+validSurfaceForms.size()+" valid surface forms to acquire occurrence samples.");
        }


        @Override
        public boolean shouldKeepTheseOccurrences(List<String> items) {
            boolean shouldKeep = false;
            for (String item: items) {
                StringBuffer sf = new StringBuffer();
                try {
                    String[] fields = item.split("\t");
                    if (fields.length >= 5) {
                        sf = sf.append(fields[2]);
                    }
                    else {
                        sf = sf.append(fields[1]);
                    }
                }
                catch (ArrayIndexOutOfBoundsException e) {
                    LOG.debug("Error parsing line: "+item);
                }
                for (String validSf: mValidSurfaceForms) {
                    //if (sf.toString().toLowerCase().contains(validSf.toLowerCase())) { // relaxed
                    if (sf.toString().toLowerCase().equals(validSf.toLowerCase())) {   // strict
                        shouldKeep = true;
                        LOG.trace("Kept:"+sf+" because it matches "+validSf);
                        break;
                    }
                }
            }
            return shouldKeep;
        }


    }




    /**
     * TODO created by Max: this functions allows for one call to create "confusable-with" sets
     * For a given type, goes through the data set that keeps the types for each resource.
     * If the type matches, look in the surrogate index for this URI (opposite direction as usually)
     * for all surface forms that can relate to this URI.
     * Return all surface forms found this way. 
     */
    public static Set<String> getConfusableSurfaceForms(String targetType, File instancesFile, LuceneCandidateSearcher surrogateSearcher) throws IOException, ParseException {
        System.err.println("Getting all surface forms for "+targetType+"s...");
        Set<String> surfaceForms = new HashSet<String>();
        if (!targetType.startsWith(SpotlightConfiguration.DEFAULT_ONTOLOGY_PREFIX))
            targetType = SpotlightConfiguration.DEFAULT_ONTOLOGY_PREFIX+ targetType;
        NxParser parser = new NxParser(new FileInputStream(instancesFile));
        while (parser.hasNext()) {
            Node[] triple = parser.next();
            if (triple[2].toString().equals(targetType)) {
                String targetUri = triple[0].toString().replace(SpotlightConfiguration.DEFAULT_NAMESPACE, "");
                try {
                    Set<SurfaceForm> surfaceFormsForURI = surrogateSearcher.getSurfaceForms(new DBpediaResource(targetUri));
                    for (SurfaceForm sf : surfaceFormsForURI) {
                        surfaceForms.add(sf.name());    
                    }
                }
                catch (SearchException e) {
                    System.err.println("URI "+targetUri+" not found in surrogate index. Skipping.");
                }
            }
        }


        return surfaceForms;
    }






    //TODO Make this guy parameterizable from command line.
    public static void main(String[] args) throws IOException, ParseException {
        /**
         * Split dataset in training and test.
         * percentageSplit indicates how much to save for testing
         * minSize indicates the minimum number of occurrences a URI has to have for it to make it to training/testing
         */
        int minSize = 2;
        double percentageSplit = 0.5;
        String targetType = "Actor"; //"Person";  //Place  //Organisation
        /*
        Here I'm using wikipediaOccurrences.ambiguous.tsv.gz
        Be careful here. Do not use withtype because it is a join with the types,
        so for URIs that have multiple types the same entry is repeated multiple times.
         */


        System.err.println("Making confusable with "+targetType+" data sets.");


        File inputFile = new File("data/WikipediaOccurrences-IDs-clean_enwiki-20100312.uriSorted.tsv");
        File trainingFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTraining."+(new Double((1-percentageSplit)*100)).intValue()+"."+targetType+".amb.tsv");
        File testFile = new File("E:/dbpa/data/Person_newSurrogates/wikipediaTest."+(new Double(percentageSplit*100)).intValue()+"."+targetType+".amb.tsv");


        // using the next few lines, to create "confusable-with", split in training and testing
        File instancesFile = new File("data/dbpedia/instance_types_en.nt");
        File surrogateIndexDir = new File("data/SurrogateIndex.TitRedDisOcc.lowerCase");
        LuceneManager manager = new LuceneManager.CaseInsensitiveSurfaceForms(FSDirectory.open(surrogateIndexDir));
        LuceneCandidateSearcher surrogateSearcher = new LuceneCandidateSearcher(manager, false);
        Set<String> surfaceForms = getConfusableSurfaceForms(targetType, instancesFile, surrogateSearcher);


        DatasetSplitter splitter = new BySurfaceForm(trainingFile, testFile, minSize, percentageSplit, surfaceForms);
        //DatasetSplitter splitter = new BySize(trainingFile, testFile, minSize, percentageSplit);


        splitter.run(new FileInputStream(inputFile));
        //splitter.run(new GZIPInputStream(new FileInputStream(inputFile)));


    }




}
Source Code of org.dbpedia.spotlight.io.DatasetSplitter$BySurfaceForm

Related Classes of org.dbpedia.spotlight.io.DatasetSplitter$BySurfaceForm