Examples of SRILMFactoredBundleCorpusIterator


Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

        try {
            wbr = new BufferedWriter(new FileWriter(wd));
            pbr = new BufferedWriter(new FileWriter(posd));
            minCatFreq = catFreq;
            if (tokenisationType.equalsIgnoreCase("srilm")) {
                incorp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            } else {
                incorp = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            }
        } catch (IOException ex) {
            Logger.getLogger(TaggingDictionaryExtractor.class.getName()).log(Level.SEVERE, null, ex);
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

            if(args[i].equals("-o")) { output = args[++i]; continue; }
            System.err.println("unknown command-line option: " + args[i]);
        }
       
        BufferedReader in = new BufferedReader(new FileReader(new File(input)));
        SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(in);
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(output)));
       
       
        for(List<Word> sent : corp) {
            out.write("<s> ");
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator in = null;
        BufferedWriter out = null;
        try {
           
            String inputCorp = "<stdin>", output = "<stdout>",
                   configFile = null;
           
            double beta = 1.0;
            boolean test = false;

            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-i")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-e")) { test = true; continue; }
                if (args[i].equals("-c")) { configFile = args[++i]; continue; }
                if (args[i].equals("-beta")) { beta = Double.parseDouble(args[++i]); continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }

            ResultSink rs = new ResultSink(ResultSink.ResultSinkType.SUPERTAG);
            try {               
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ?
                            new BufferedReader(new InputStreamReader(System.in)) :
                            new BufferedReader(new FileReader(new File(inputCorp))));               
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            WordAndPOSDictionaryLabellingStrategy stgger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(configFile);
           
            // for each sentence, print out:
            // <s>
            // w1   <numPOSTags>    <posTag1>   ... <posTagK>   <numSupertags>  <supertag1> ... <supertagL>
            // ...
            // wN   <numPOSTags>    <posTag1>   ... <posTagM>   <numSupertags>  <supertag1> ... <supertagU>
            // </s>
            for (List<Word> inLine : in) {
               
                List<List<Pair<Double,String>>> taggedSent = stgger.multitag(inLine, beta);
                if(test) { rs.addSent(taggedSent, inLine); }
                // beginning of sentence...
                out.write("<s>" + System.getProperty("line.separator"));               
                List<TaggedWord> posTagging = stgger.getCurrentTagging();
                int cursor = -1;
                while(++cursor < taggedSent.size()) {
                    Word wdIn = inLine.get(cursor);
                    // word form...
                    out.write(wdIn.getForm());
                    TaggedWord posT = posTagging.get(cursor);
                    // print out number of POS tags, followed by tab-separated probabilized POS tagging.
                    out.write("\t" + posT.getPOSTagging().size());
                    for(Pair<Double,String> pt : posT.getPOSTagging()) {
                        out.write("\t" + pt.b + "\t" + pt.a);
                    }
                    // now print out number of and list of tab-separated, probabilized supertags.
                    out.write("\t" + taggedSent.get(cursor).size());
                    for(Pair<Double,String> stg : taggedSent.get(cursor)) {
                        out.write("\t" + stg.b + "\t" + stg.a);
                    }
                    out.write(System.getProperty("line.separator"));
                }
                out.write("</s>" + System.getProperty("line.separator"));
            }
            out.flush();

            if(test) { System.err.println(rs.report()); }
        } catch (IOException ex) {
            Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException ex) {
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

             
              Iterator<List<Word>> corpus = null;
              Iterator<List<Word>> goldCorpus = null;
             
              if(options.valueOf(tokenisation).equalsIgnoreCase("srilm")) {
                  corpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec))));
              } else if(options.valueOf(tokenisation).equalsIgnoreCase("candc")) {
                  corpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec))));
              }
              if(options.has("test") && options.valueOf(tokenisation).equalsIgnoreCase("srilm")) {
                  goldCorpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec))));
              } else if(options.has("test") && options.valueOf(tokenisation).equalsIgnoreCase("candc")) {
                  goldCorpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec))));
              }
             
              BufferedWriter outf = new BufferedWriter(new FileWriter(options.valueOf(outputspec)));
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

            if(args[i].equals("-o")) { output = args[++i]; continue; }
            System.err.println("unknown command-line option: " + args[i]);
        }
       
        BufferedReader in = new BufferedReader(new FileReader(new File(input)));
        SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(in);
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(output)));
       
       
        for(List<Word> sent : corp) {
            out.write("<s> ");
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator inp = null;
        BufferedWriter out = null;
       
        try {
            String inputCorp = "<stdin>", output = "<stdout>",
                   configFile = null;
           
            boolean test = false;

            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-i")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-e")) { test = true; continue; }
                if (args[i].equals("-c")) { configFile = args[++i]; continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }

            ResultSink rs = new ResultSink(ResultSink.ResultSinkType.POSTAG);
           
            try {                       
                inp = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ?
                            new BufferedReader(new InputStreamReader(System.in)) :
                            new BufferedReader(new FileReader(new File(inputCorp))));               
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
           

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            POSTagger post = POSTagger.posTaggerFactory(configFile);
           
            for (List<Word> inLine : inp) {
                List<TaggedWord> taggedSent = post.tagSentence(inLine);
                List<List<Pair<Double,String>>> sentTagging = new ArrayList<List<Pair<Double,String>>>(taggedSent.size());
                for(TaggedWord tw : taggedSent) { sentTagging.add(tw.getPOSTagging()); }
                if(test) { rs.addSent(sentTagging, inLine); }
                out.write("<s>" + System.getProperty("line.separator"));
                for(TaggedWord tw : taggedSent) {
                    out.write(tw.getForm());
                    for(Pair<Double,String> tg : tw.getPOSTagging()) {
                        out.write("\t" + tg.b + "\t" + tg.a);
                    }
                    out.write(System.getProperty("line.separator"));
                }
                out.write("</s>" + System.getProperty("line.separator"));
            }
            out.flush();

            if(test) { System.err.println(rs.report()); }
        } catch(Throwable t) {
            t.printStackTrace();
        } finally {
            try {               
                inp.close();
                out.close();
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator in = null;
        BufferedWriter out = null;
        BufferedWriter voc = null;

        try {
            String inputCorp = "<stdin>", output = "<stdout>", vocabFile = "vocab.voc";

            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-c")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-vocab")) { vocabFile = args[++i]continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }

            try {
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp))));
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }

            try {
                voc = new BufferedWriter(new FileWriter(new File(vocabFile)));
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }

            Map<String, Integer> vocab = new HashMap<String, Integer>();
            for (List<Word> inLine : in) {
                for (Word w : inLine) {
                    String pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()),
                        wform = WORD + "-" + DefaultTokenizer.escape(w.getForm());

                    vocab.put(pos, (vocab.get(pos) == null) ? 1 : vocab.get(pos) + 1);
                    vocab.put(wform, (vocab.get(wform) == null) ? 1 : vocab.get(wform) + 1);
                    out.write(wform + ":" + pos + " ");
                }
                out.write(System.getProperty("line.separator"));
            }
            out.flush();

            for (String str : vocab.keySet()) {
                    voc.write(str + System.getProperty("line.separator"));
            }
            voc.flush();
        } finally {
            try {
                out.close();
                in.close();
                voc.close();
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator in = null;
        BufferedWriter out = null;
        BufferedWriter voc = null;

        try {
            String inputCorp = "<stdin>", output = "<stdout>", vocabFile = "vocab.voc";
            int catCutoff = 10;

            for (int i = 0; i <
                    args.length; i++) {
                if (args[i].equals("-c")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-vocab")) {vocabFile = args[++i]; continue; }
                if (args[i].equals("-u")) { catCutoff = Integer.parseInt(args[++i]); continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }
           
            try {
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp))));
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }

            try {
                voc = new BufferedWriter(new FileWriter(new File(vocabFile)));
            } catch (IOException ex) {
                Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }

            Map<String, Integer> vocab = new HashMap<String, Integer>();
            for (List<Word> inLine : in) {
                for (Word w : inLine) {
                    String st = SUPERTAG + "-" + DefaultTokenizer.escape(w.getSupertag()),
                            pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()),
                            wform = WORD + "-" + DefaultTokenizer.escape(w.getForm());

                    vocab.put(st, (vocab.get(st) == null) ? 1 : vocab.get(st) + 1);
                    vocab.put(pos, (vocab.get(pos) == null) ? 1 : vocab.get(pos) + 1);
                    vocab.put(wform, (vocab.get(wform) == null) ? 1 : vocab.get(wform) + 1);
                }

            }

            // reopen file
            try {
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp))));
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(
                        -1);
            }
            for (List<Word> inLine : in) {
                for (Word w : inLine) {
                    String st = SUPERTAG + "-" + DefaultTokenizer.escape(w.getSupertag()),
                            pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()),
                            wform = WORD + "-" + DefaultTokenizer.escape(w.getForm());
                    if (vocab.get(st) > catCutoff) {
                        out.write(wform + ":" + pos + ":" + st + " ");
                    }
                }

                out.write(System.getProperty("line.separator"));
            }

            out.flush();

            for (String str : vocab.keySet()) {
                if (vocab.get(str) > catCutoff) {
                    voc.write(str + System.getProperty("line.separator"));
                }
            }

            voc.flush();
        } finally {
            try {
                out.close();
                in.close();
                voc.close();
            } catch (IOException ex) {
                Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

    public ZhangLeTrainingExtractor(File corpus, File outputF, String tokenisation, FeatureExtractor fexer) {
        this.fexer = fexer;
        this.outputF = outputF;
        try {
            if (tokenisation.equalsIgnoreCase("srilm")) {
                incorp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            } else {
                incorp = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            }
        } catch (FileNotFoundException ex) {
            Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex);
View Full Code Here

Examples of opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator

            if(args[j].equals("-o")) { output = args[++j]; continue; }
            if(args[j].equals("-p")) { priorModF = args[++j]; continue; }
            if(args[j].equals("-v")) { priorVocab = args[++j]; continue; }
            System.err.println("Unrecognized option: " + args[j]);
        }
        SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(
                input.equals("<stdin>") ?
                    new BufferedReader(new InputStreamReader(System.in)) :
                    new BufferedReader(new FileReader(new File(input))));
        BufferedWriter out = new BufferedWriter(
                output.equals("<stdout>") ?
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.