Package org.languagetool.dev.eval

Source Code of org.languagetool.dev.eval.CheckBNC$BNCTextFilter

/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/

package org.languagetool.dev.eval;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;

import org.languagetool.JLanguageTool;
import org.languagetool.commandline.CommandLineTools;
import org.languagetool.language.English;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

/**
* Uses JLanguageTool recursively on the files of the BNC (British National Corpus).
*
* @author Daniel Naber
*/
public final class CheckBNC {

  private JLanguageTool langTool = null;
  private final BNCTextFilter textFilter = new BNCTextFilter();

  private static final boolean CHECK_BY_SENTENCE = true;

  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.out.println("Usage: CheckBNC <directory>");
      System.exit(1);
    }
    final CheckBNC prg = new CheckBNC();
    prg.run(new File(args[0]));
  }
 
  private CheckBNC() throws IOException {
    langTool = new JLanguageTool(new English());
    langTool.activateDefaultPatternRules();
    final String[] disRules = {"UPPERCASE_SENTENCE_START", "COMMA_PARENTHESIS_WHITESPACE",
        "WORD_REPEAT_RULE", "DOUBLE_PUNCTUATION"};
    System.err.println("Note: disabling the following rules:");
    for (String disRule : disRules) {
      langTool.disableRule(disRule);
      System.err.println(" " + disRule);
    }
  }

  private void run(final File file) throws IOException {
    if (file.isDirectory()) {
      final File[] files = file.listFiles();
      for (File file1 : files) {
        run(new File(file, file1.getName()));
      }
    } else {
      System.out.println("Checking " + file.getAbsolutePath());
      String text = StringTools.readStream(new FileInputStream(file.getAbsolutePath()), "utf-8");
      text = textFilter.filter(text);
      if (CHECK_BY_SENTENCE) {
        final Tokenizer sentenceTokenizer = langTool.getLanguage().getSentenceTokenizer();
        final List<String> sentences = sentenceTokenizer.tokenize(text);
        for (String sentence : sentences) {
          CommandLineTools.checkText(sentence, langTool, false, 1000);
        }
      } else {
        CommandLineTools.checkText(text, langTool);
      }
    }
  }

  class BNCTextFilter {

    public String filter(String text) {
      String fText = text.replaceAll("(?s)<header.*?>.*?</header>", "");
      fText = fText.replaceAll("<w.*?>", "");
      fText = fText.replaceAll("<c.*?>", "");
      fText = fText.replaceAll("<.*?>", "");
      fText = fText.replaceAll(" +", " ");
      fText = fText.replaceAll("&bquo|&equo", "\"");
      fText = fText.replaceAll("&mdash;?", "--");
      fText = fText.replaceAll("&amp;?", "&");
      return fText;
    }

  }

}
TOP

Related Classes of org.languagetool.dev.eval.CheckBNC$BNCTextFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.