Package edu.stanford.nlp.util

Examples of edu.stanford.nlp.util.Timing
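Timing is a small stopwatch utility used throughout Stanford CoreNLP to report elapsed wall-clock time. The excerpts below show its recurring usage patterns. First, a minimal sketch of the API as it appears in those excerpts (the method names and return types are taken from the code below, but the exact output format of each call is an assumption, so treat this as illustrative rather than definitive):

import edu.stanford.nlp.util.Timing;

public class TimingSketch {
  public static void main(String[] args) throws InterruptedException {
    // A Timing starts measuring as soon as it is constructed.
    Timing tim = new Timing();
    Thread.sleep(200);                         // ... stand-in for real work ...
    System.err.println("done [" + tim.toSecondsString() + " sec].");

    // done(msg) is the one-line variant: report a message plus the elapsed time.
    Timing phase = new Timing();
    Thread.sleep(200);
    phase.done("Some phase");

    // doing(msg) announces a step; the no-argument done() later reports its duration.
    Timing load = new Timing();
    load.doing("Loading resources");
    Thread.sleep(200);
    load.done();

    // stop() returns the elapsed milliseconds for custom reporting.
    Timing work = new Timing();
    Thread.sleep(200);
    long millis = work.stop();
    System.err.println("Worked for " + millis + " ms.");
  }
}

Timing each iteration of a parser training loop: a fresh Timing is created at the top of the iteration and reported with done("Iteration ...") after all batches are processed: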


    if (op.trainOptions().featureFrequencyCutoff > 1) {
      featureFrequencies = new IntCounter<String>();
    }

    for (int iteration = 1; iteration <= op.trainOptions.trainingIterations; ++iteration) {
      Timing trainingTimer = new Timing();
      int numCorrect = 0;
      int numWrong = 0;
      Collections.shuffle(indices, random);
      for (int start = 0; start < indices.size(); start += op.trainOptions.batchSize) {
        int end = Math.min(start + op.trainOptions.batchSize, indices.size());
        Triple<List<Update>, Integer, Integer> result = trainBatch(indices.subList(start, end), binarizedTrees, transitionLists, updates, oracle, wrapper);

        numCorrect += result.second;
        numWrong += result.third;

        for (Update update : result.first) {
          for (String feature : update.features) {
            if (allowedFeatures != null && !allowedFeatures.contains(feature)) {
              continue;
            }
            Weight weights = featureWeights.get(feature);
            if (weights == null) {
              weights = new Weight();
              featureWeights.put(feature, weights);
            }
            weights.updateWeight(update.goldTransition, update.delta);
            weights.updateWeight(update.predictedTransition, -update.delta);

            if (featureFrequencies != null) {
              featureFrequencies.incrementCount(feature, (update.goldTransition >= 0 && update.predictedTransition >= 0) ? 2 : 1);
            }
          }
        }
        updates.clear();
      }
      trainingTimer.done("Iteration " + iteration);
      System.err.println("While training, got " + numCorrect + " transitions correct and " + numWrong + " transitions wrong");
      outputStats();


      double labelF1 = 0.0;
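Timing a full treebank evaluation: a single Timing spans the whole testOnTreebank run and is reported with done() once parsing completes: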


  /**
   *  @return The labeled precision/recall F<sub>1</sub> (EVALB measure)
   *          of the parser on the treebank.
   */
  public double testOnTreebank(Treebank testTreebank) {
    System.err.println("Testing on treebank");
    Timing treebankTotalTimer = new Timing();
    TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
    TreebankLangParserParams tlpParams = op.tlpParams;
    TreebankLanguagePack tlp = op.langpack();
    PrintWriter pwOut, pwErr;
    if (op.testOptions.quietEvaluation) {
      NullOutputStream quiet = new NullOutputStream();
      pwOut = tlpParams.pw(quiet);
      pwErr = tlpParams.pw(quiet);
    } else {
      pwOut = tlpParams.pw();
      pwErr = tlpParams.pw(System.err);
    }
    if (op.testOptions.verbose) {
      pwErr.print("Testing ");
      pwErr.println(testTreebank.textualSummary(tlp));
    }
    if (op.testOptions.evalb) {
      EvalbFormatWriter.initEVALBfiles(tlpParams);
    }

    PrintWriter pwFileOut = null;
    if (op.testOptions.writeOutputFiles) {
      String fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
      try {
        pwFileOut = op.tlpParams.pw(new FileOutputStream(fname));
      } catch (IOException ioe) {
        ioe.printStackTrace();
      }
    }

    PrintWriter pwStats = null;
    if (op.testOptions.outputkBestEquivocation != null) {
      try {
        pwStats = op.tlpParams.pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
      } catch (IOException ioe) {
        ioe.printStackTrace();
      }
    }

    if (op.testOptions.testingThreads != 1) {
      MulticoreWrapper<List<? extends HasWord>, ParserQuery> wrapper = new MulticoreWrapper<List<? extends HasWord>, ParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));

      LinkedList<Tree> goldTrees = new LinkedList<Tree>();
      for (Tree goldTree : testTreebank) {
        List<? extends HasWord> sentence = getInputSentence(goldTree);
        goldTrees.add(goldTree);

        pwErr.println("Parsing [len. " + sentence.size() + "]: " + Sentence.listToString(sentence));
        wrapper.put(sentence);
        while (wrapper.peek()) {
          ParserQuery pq = wrapper.poll();
          goldTree = goldTrees.poll();
          processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
        }
      } // for tree iterator
      wrapper.join();
      while (wrapper.peek()) {
        ParserQuery pq = wrapper.poll();
        Tree goldTree = goldTrees.poll();
        processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
      }
    } else {
      ParserQuery pq = pqFactory.parserQuery();

      for (Tree goldTree : testTreebank) {
        final List<CoreLabel> sentence = getInputSentence(goldTree);

        pwErr.println("Parsing [len. " + sentence.size() + "]: " + Sentence.listToString(sentence));

        pq.parseAndReport(sentence, pwErr);

        processResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
      } // for tree iterator
    }

    //Done parsing...print the results of the evaluations
    treebankTotalTimer.done("Testing on treebank");
    if (op.testOptions.quietEvaluation) {
      pwErr = tlpParams.pw(System.err);
    }
    if (saidMemMessage) {
      ParserUtils.printOutOfMemory(pwErr);

    }
  }
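Timing the loading of a parser from a text grammar file; the elapsed time is printed by hand with toSecondsString(). Note the additional static Timing.startTime() call alongside the instance timer: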

  protected static LexicalizedParser getParserFromTextFile(String textFileOrUrl, Options op) {
    try {
      Timing tim = new Timing();
      System.err.print("Loading parser from text file " + textFileOrUrl + ' ');
      BufferedReader in = IOUtils.readerFromString(textFileOrUrl);
      Timing.startTime();

      String line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      op.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> stateIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> wordIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Index<String> tagIndex = HashIndex.loadFromReader(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      Lexicon lex = op.tlpParams.lex(op, wordIndex, tagIndex);
      String uwmClazz = line.split(" +")[2];
      if (!uwmClazz.equals("null")) {
        UnknownWordModel model = ReflectionLoading.loadByReflection(uwmClazz, op, lex, wordIndex, tagIndex);
        lex.setUnknownWordModel(model);
      }
      lex.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      UnaryGrammar ug = new UnaryGrammar(stateIndex);
      ug.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      BinaryGrammar bg = new BinaryGrammar(stateIndex);
      bg.readData(in);
      System.err.print(".");

      line = in.readLine();
      confirmBeginBlock(textFileOrUrl, line);
      DependencyGrammar dg = new MLEDependencyGrammar(op.tlpParams, op.directional, op.distance, op.coarseDistance, op.trainOptions.basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
      dg.readData(in);
      System.err.print(".");

      in.close();
      System.err.println(" done [" + tim.toSecondsString() + " sec].");
      return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;

  }
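The same load-and-report pattern for a serialized parser model: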


  public static LexicalizedParser getParserFromSerializedFile(String serializedFileOrUrl) {
    try {
      Timing tim = new Timing();
      System.err.print("Loading parser from serialized file " + serializedFileOrUrl + " ...");
      ObjectInputStream in = IOUtils.readStreamFromString(serializedFileOrUrl);
      LexicalizedParser pd = loadModel(in);

      in.close();
      System.err.println(" done [" + tim.toSecondsString() + " sec].");
      return pd;
    } catch (InvalidClassException ice) {
      // For this, it's not a good idea to continue and try it as a text file!
      System.err.println();   // finish the partial line printed by the message above
      throw new RuntimeException("Invalid class in file: " + serializedFileOrUrl, ice);
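Bracketing a long model load with doing() and done(): doing() announces the step up front, and the no-argument done() reports the elapsed time once loading finishes: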

  /**
   *  @throws IOException If there are I/O errors
   *  @throws ClassNotFoundException If there are serialization errors
   */
  protected static TaggerConfig readModelAndInit(TaggerConfig config, DataInputStream rf,
                                         boolean printLoading) throws IOException, ClassNotFoundException {
    Timing t = new Timing();
    if (printLoading) t.doing("Reading POS tagger model from " + config.getModel());
    // then init tagger
    init(config);
    TaggerConfig ret = TaggerConfig.readConfig(rf); // the TaggerConfig stored in the file was already put into config by the TaggerConfig constructor, so usually we just read past it here.

    xSize = rf.readInt();
    ySize = rf.readInt();
    dict.read(rf);

    if (VERBOSE) {
      System.err.println(" dictionary read ");
    }
    tags.read(rf);
    System.out.println(tags.index);
    readExtractors(rf);
    dict.setAmbClasses();

    int[] numFA = new int[extractors.getSize() + extractorsRare.getSize()];
    int sizeAssoc = rf.readInt();
    // init the Hash at the right size for efficiency (avoid resizing ops)
    // mg2008: sizeAssoc defines the number of keys, whereas specifying
    // sizeAssoc as argument defines an initial size.
    // Unless load factor is >= 1, fAssociations is guaranteed to resize at least once.
    //fAssociations = new HashMap<FeatureKey,Integer>(sizeAssoc);
    fAssociations = new HashMap<FeatureKey,Integer>(sizeAssoc*2);
    if (VERBOSE) System.err.printf("Reading %d feature keys...\n",sizeAssoc);
    PrintFile pfVP = null;
    if (VERBOSE) {
      pfVP = new PrintFile("pairs.txt");
    }
    for (int i = 0; i < sizeAssoc; i++) {
      int numF = rf.readInt();
      FeatureKey fK = new FeatureKey();
      fK.read(rf);
      numFA[fK.num]++;
      fAssociations.put(fK, numF);
    }
    if (VERBOSE) {
      pfVP.close();
    }
    if (VERBOSE) {
      for (int k = 0; k < numFA.length; k++) {
        System.err.println(" Number of features of kind " + k + ' ' + numFA[k]);
      }
    }
    prob = new LambdaSolveTagger(rf);
    if (VERBOSE) {
      System.err.println(" prob read ");
    }
    if (printLoading) t.done();
    return ret;
  }
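Using stop() to obtain elapsed milliseconds, here to report words per second after a tagger test run: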

    }

    try {
      GlobalHolder.readModelAndInit(config, config.getModel(), true);

      Timing t = new Timing();
      TestClassifier tC = new TestClassifier(config);
      long millis = t.stop();
      printErrWordsPerSec(millis, tC.getNumWords());
      tC.printModelAndAccuracy(config);
    } catch (Exception e) {
      System.err.println("An error occured while testing the tagger.");
      e.printStackTrace();
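Timing an entire POS tagger training run, reported via done() with a message: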

  private static void runTraining(TaggerConfig config) {
    Date now = new Date();

    System.err.println("## tagger training invoked at " + now + " with arguments:");
    config.dump();
    Timing tim = new Timing();
    try {
      PrintFile log = new PrintFile(config.getModel() + ".props");
      log.println("## tagger training invoked at " + now + " with arguments:");
      config.dump(log);
      log.close();

      TestClassifier.trainAndSaveModel(config);
      tim.done("Training POS tagger");
    } catch (Exception e) {
      System.err.println("An error occurred while training a new tagger.");
      e.printStackTrace();
    }
  }
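Timing a whole tagging session over files or stdin; stop() again supplies the milliseconds for throughput reporting: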

    }
    BufferedWriter writer = null;
    try {
      MaxentTagger tagger = new MaxentTagger(config.getModel(), config);

      Timing t = new Timing();
      String sentenceDelimiter = null;
      final TokenizerFactory<? extends HasWord> tokenizerFactory; // initialized immediately below
      if (config.getTokenize() && config.getTokenizerFactory().trim().length() != 0) {
        Class<TokenizerFactory<? extends HasWord>> clazz = (Class<TokenizerFactory<? extends HasWord>>) Class.forName(config.getTokenizerFactory().trim());
        Method factoryMethod = clazz.getMethod("newTokenizerFactory");
        tokenizerFactory = (TokenizerFactory<? extends HasWord>) factoryMethod.invoke(null);
      } else if (config.getTokenize()){
        tokenizerFactory = PTBTokenizerFactory.newWordTokenizerFactory(config.getTokenizerOptions());
      } else {
        tokenizerFactory = WhitespaceTokenizer.factory();
        sentenceDelimiter = "\n";
      }
      final DocumentPreprocessor docProcessor = new DocumentPreprocessor(tokenizerFactory);
      docProcessor.setEncoding(config.getEncoding());

      //Counts
      int numWords = 0;
      int numSentences = 0;
      String outFile = config.getOutputFile();

      if (outFile.length() > 0) {
        writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), config.getEncoding()));
      } else {
        writer = new BufferedWriter(new OutputStreamWriter(System.out, config.getEncoding()));
      }

      String[] xmlInput = config.getXMLInput();
      if (xmlInput.length > 0) {
        if (xmlInput.length > 1 || !xmlInput[0].equals("null")) {
          tagFromXML(config);
          return;
        }
      }
      boolean stdin = config.getFile().trim().equalsIgnoreCase("stdin");

      while (true) {
        //Now determine if we're tagging from stdin or from a file
        BufferedReader br;
        if (!stdin) {
          br = IOUtils.readReaderFromString(config.getFile(), config.getEncoding());
        } else {
          System.err.println("Type some text to tag, then EOF.");
          System.err.println("  (For EOF, use Return, Ctrl-D on Unix; Enter, Ctrl-Z, Enter on Windows.)");
          br = new BufferedReader(new InputStreamReader(System.in));
        }

        int outputStyle = PlainTextDocumentReaderAndWriter.asIntOutputFormat(config.getOutputFormat());
        if (config.getSGML()) {
          // this uses NER codebase technology to read/write SGML-ish files
          PlainTextDocumentReaderAndWriter readerAndWriter = new PlainTextDocumentReaderAndWriter();
          ObjectBank<List<CoreLabel>> ob = new ObjectBank<List<CoreLabel>>(new ReaderIteratorFactory(br), readerAndWriter);
          PrintWriter pw = new PrintWriter(writer);
          for (List<CoreLabel> sentence : ob) {
            Sentence<CoreLabel> s = new Sentence<CoreLabel>(sentence);
            numWords += s.length();
            Sentence<TaggedWord> taggedSentence = MaxentTagger.tagSentence(s);
            Iterator<CoreLabel> origIter = sentence.iterator();
            for (TaggedWord tw : taggedSentence) {
              CoreLabel cl = origIter.next();
              cl.set(CoreAnnotations.AnswerAnnotation.class, tw.tag());
            }
            readerAndWriter.printAnswers(sentence, pw, outputStyle, true);
          }
        } else {
          //Now we do everything through the doc preprocessor
          List<List<? extends HasWord>> document;
          if (config.getTagInside() != null && !config.getTagInside().equals("")) {
            document = docProcessor.getSentencesFromXML(br, config.getTagInside(), null, false);
          } else if (stdin) {
            document = docProcessor.getSentencesFromText(new StringReader(br.readLine()));
          } else {
            document = docProcessor.getSentencesFromText(br, sentenceDelimiter);
          }

          for (List<? extends HasWord> sentence : document) {
            numWords += sentence.size();
            Sentence<TaggedWord> taggedSentence = MaxentTagger.tagSentence(sentence);

            if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_TSV) {
              writer.write(getTsvWords(taggedSentence));
            } else if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_XML) {
              writeXMLSentence(writer, taggedSentence, numSentences);
            } else { // if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_SLASH_TAGS) {
              writer.write(taggedSentence.toString(false));
              writer.newLine();
            }
            if (stdin) {
              writer.newLine();
              writer.flush();
            }
            numSentences++;
          }
        }
        if (!stdin) break;
      }
      long millis = t.stop();
      printErrWordsPerSec(millis, numWords);
    } catch (Exception e) {
      System.err.println("An error occurred while tagging.");
      e.printStackTrace();
    } finally {

    }
    return numTokens;
  }
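Timing PTBTokenizer's untokenization and reporting tokens per second from the stopped timer: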

  private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens = ptb2Text(r, out);
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
        PrintWriter out;
        if (outputFileList == null) {
          out = new PrintWriter(System.out, true);
        } else {
          out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
        }
        numTokens += ptb2Text(r, out);
        out.close();
      }
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.err.println("PTBTokenizer untokenized " + numTokens + " tokens at " +
                       nf.format(wordspersec) + " tokens per second.");
  }

    return ptb2Text(words);
  }
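The tokenization path uses the same stop()-and-report pattern: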


  private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump) throws IOException {
    Timing t = new Timing();
    int numTokens = 0;
    int sz = inputFileList.size();
    if (sz == 0) {
      Reader r = new InputStreamReader(System.in, charset);
      PrintWriter out = new PrintWriter(System.out, true);
      numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
    } else {
      for (int j = 0; j < sz; j++) {
        Reader r = IOUtils.readReaderFromString(inputFileList.get(j), charset);
        PrintWriter out;
        if (outputFileList == null) {
          out = new PrintWriter(System.out, true);
        } else {
          out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true);
        }

        numTokens += tokReader(r, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump);
        r.close();
        if (outputFileList != null) out.close();
      } // end for j going through inputFileList
    }
    long millis = t.stop();
    double wordspersec = numTokens / (((double) millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    System.err.println("PTBTokenizer tokenized " + numTokens + " tokens at " +
                       nf.format(wordspersec) + " tokens per second.");
  }
