Package org.languagetool.tokenizers

Examples of org.languagetool.tokenizers.Tokenizer


    } else {
      System.out.println("Checking " + file.getAbsolutePath());
      String text = StringTools.readFile(new FileInputStream(file.getAbsolutePath()));
      text = textFilter.filter(text);
      if (CHECK_BY_SENTENCE) {
        final Tokenizer sentenceTokenizer = langTool.getLanguage().getSentenceTokenizer();
        final List<String> sentences = sentenceTokenizer.tokenize(text);
        for (String sentence : sentences) {
          Tools.checkText(sentence, langTool, false, 1000);
        }
      } else {
        Tools.checkText(text, langTool);
View Full Code Here


      }
    }
  }

  private File tokenizeInput(File plainTextDictFile, Language language) throws IOException {
    Tokenizer wordTokenizer = language.getWordTokenizer();
    String encoding = getOption("fsa.dict.encoding");
    String separatorChar = hasOption("fsa.dict.separator") ? getOption("fsa.dict.separator") : "";
    File tempFile = File.createTempFile(SpellDictionaryBuilder.class.getSimpleName(), ".txt");
    try (Scanner scanner = new Scanner(plainTextDictFile, encoding)) {
      try (Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempFile), encoding))) {
        while (scanner.hasNextLine()) {
          String line = scanner.nextLine();
          int sepPos = separatorChar.isEmpty() ? -1 : line.indexOf(separatorChar);
          String occurrences = sepPos != -1 ? line.substring(sepPos + separatorChar.length()) : "";
          String lineWithoutOcc = sepPos != -1 ? line.substring(0, sepPos) : line;
          List<String> tokens = wordTokenizer.tokenize(lineWithoutOcc);
          for (String token : tokens) {
            if (token.length() > 0) {
              out.write(token);
              if (sepPos != -1) {
                out.write(separatorChar);
View Full Code Here

    } else {
      System.out.println("Checking " + file.getAbsolutePath());
      String text = StringTools.readStream(new FileInputStream(file.getAbsolutePath()), "utf-8");
      text = textFilter.filter(text);
      if (CHECK_BY_SENTENCE) {
        final Tokenizer sentenceTokenizer = langTool.getLanguage().getSentenceTokenizer();
        final List<String> sentences = sentenceTokenizer.tokenize(text);
        for (String sentence : sentences) {
          CommandLineTools.checkText(sentence, langTool, false, 1000);
        }
      } else {
        CommandLineTools.checkText(text, langTool);
View Full Code Here

TOP

Related Classes of org.languagetool.tokenizers.Tokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact: coftware#gmail.com.