Examples of DocumentPreprocessor

edu.stanford.nlp.process.DocumentPreprocessor
Produces a list of sentences from either a plain text or XML document.
Tokenization: The default tokenizer is {@link PTBTokenizer}. If null is passed to setTokenizerFactory, then whitespace tokenization is assumed.
Adding a new document type requires two steps:
1. Add a new DocType.
2. Create an iterator for the new DocType and modify the iterator() function to return the new iterator.
NOTE: This implementation should not use external libraries since it is used in the parser. Author: Spence Green.
edu.ucla.sspace.text.DocumentPreprocessor
A class for preprocessing all types of documents. This approach was used by Rohde et al. (2004) for processing USENET articles.

Examples of edu.stanford.nlp.process.DocumentPreprocessor

    // Build the tagger from the first command-line argument
    // (presumably the path to a tagger model — confirm against the full example).
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // Tokenizer factory that produces CoreLabel tokens, created with the
    // "untokenizable=noneKeep" option string.
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
                     "untokenizable=noneKeep");
    // Input is the file named by the second argument, decoded as UTF-8.
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    // Output goes to standard output, encoded as UTF-8.
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
    // Wrap the reader in a DocumentPreprocessor and explicitly install the
    // tokenizer factory built above.
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    // Iterating the preprocessor yields one token list per sentence; tag each
    // sentence and print the string form of the tagged words.
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

View Full Code Here

Examples of edu.stanford.nlp.process.DocumentPreprocessor


    //Loop over the files
    for (int i = argIndex; i < args.length; i++) {
      final String filename = args[i];


      final DocumentPreprocessor documentPreprocessor;
      if (filename.equals("-")) {
        try {
          documentPreprocessor = new DocumentPreprocessor(new BufferedReader(new InputStreamReader(System.in, op.tlpParams.getInputEncoding())),docType);
        } catch (IOException e) {
          throw new RuntimeIOException(e);
        }
      } else {
        documentPreprocessor = new DocumentPreprocessor(filename,docType,op.tlpParams.getInputEncoding());
      }


      //Unused values are null per the main() method invocation below
      //null is the default for these properties
      documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
      documentPreprocessor.setEscaper(escaper);
      documentPreprocessor.setSentenceDelimiter(sentenceDelimiter);
      documentPreprocessor.setTagDelimiter(tagDelimiter);
      documentPreprocessor.setElementDelimiter(elementDelimiter);
      if(tokenizerFactory == null)
        documentPreprocessor.setTokenizerFactory((tokenized) ? null : tlp.getTokenizerFactory());
      else
        documentPreprocessor.setTokenizerFactory(tokenizerFactory);


      //Setup the output
      PrintWriter pwo = pwOut;
      if (op.testOptions.writeOutputFiles) {
        String normalizedName = filename;

View Full Code Here

Examples of edu.stanford.nlp.process.DocumentPreprocessor


      // Here we take the first line and tokenize it as one sentence.
      String[] lines = chunk.trim().split("\\n");
      String sentence = lines[0];
      StringReader sin = new StringReader(sentence);
      DocumentPreprocessor document = new DocumentPreprocessor(sin);
      document.setSentenceFinalPuncWords(new String[] {"\n"});
      List<HasWord> tokens = document.iterator().next();
      Integer mainLabel = new Integer(tokens.get(0).word());
      //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
      tokens = tokens.subList(1, tokens.size());
      //System.err.println(tokens);

View Full Code Here

Examples of edu.stanford.nlp.process.DocumentPreprocessor

        private final Iterator<List<HasWord>> sentenceIterator;
        private Iterator<HasWord> tokenIterator;
        boolean eos = false;


        public StandfordTokenStream(Reader reader) {
            DocumentPreprocessor dp = new DocumentPreprocessor(reader);
            sentenceIterator = dp.iterator();
        }

View Full Code Here

Examples of edu.ucla.sspace.text.DocumentPreprocessor

            System.out.println(
                    "usage: java NsfAbstractCleaner <abstract_dir> <out_file>");
            System.exit(1);
        }


        DocumentPreprocessor processor = new DocumentPreprocessor();
        PrintWriter pw = new PrintWriter(args[1]);


        File baseAbstractDir = new File(args[0]);
        // Iterate over the year directories in the main directory.
        for (File abstractYearDir : baseAbstractDir.listFiles()) {


            // Skip files that are not directories and files that do not start
            // with "awards".
            if (!abstractYearDir.isDirectory() ||
                !abstractYearDir.getName().startsWith("awards"))
                continue;


            // Each NSF award year directory is split into several
            // subdirectories, iterate over each one.
            for (File abstractPartDir : abstractYearDir .listFiles()) {


                // Skip any non directory entries, such as links.html.
                if (!abstractPartDir.isDirectory())
                    continue;


                // Iterate over each award.
                for (File awardFile : abstractPartDir.listFiles()) {
                    BufferedReader br = 
                        new BufferedReader(new FileReader(awardFile));
                    StringBuilder sb = new StringBuilder();
                    boolean startedContent = false;


                    // Scan through the posting to find the "Abstract" line.
                    // This line marks the beginning of the real abstract.
                    for (String line = null; (line = br.readLine()) != null; ) {
                        if (startedContent)
                            sb.append(line).append(" ");
                        if (line.startsWith("Abstract"))
                            startedContent = true;
                    }


                    // Clean and write the posting's content to the output file.
                    sb.append("\n");
                    String cleanedContent = processor.process(sb.toString());
                    System.out.println(awardFile.getAbsolutePath());
                    pw.printf("%s\n", cleanedContent);
                    br.close();
                }
            }

View Full Code Here

Examples of edu.ucla.sspace.text.DocumentPreprocessor

        // Start from the raw article text as a plain String.
        String article = rawArticleText.toString();


        // Begin removing any tokens according to the options
        // Each enabled option replaces the article text with its processed form;
        // the preprocessor step, when enabled, runs before token filtering.
        if (options.contains(CleanerOption.USE_PREPROCESSOR)) {
            LOGGER.finer("applying preprocessor");
            // A fresh DocumentPreprocessor is created for this call.
            article = new DocumentPreprocessor().process(article);
        }
        if (options.contains(CleanerOption.FILTER_TOKENS)) {
            LOGGER.finer("filtering tokens");
            article = filterTokens(article);
        }

View Full Code Here

Examples of edu.ucla.sspace.text.DocumentPreprocessor

  try {
      if (args.length != 2) {
    usage();
    return;
      }
      DocumentPreprocessor processor = new DocumentPreprocessor();
      BufferedReader br = new BufferedReader(new FileReader(args[0]));
      BufferedWriter bw = new BufferedWriter(new FileWriter(args[1]));
      for (String line = null; (line = br.readLine()) != null;) {
    String cleaned = processor.process(line);
    if (!cleaned.equals("")){
        bw.write(cleaned);
        bw.newLine();
    }
      }

View Full Code Here

Examples of edu.ucla.sspace.text.DocumentPreprocessor

            System.out.println(
                    "usage: java TwentyNewsGroupCleaner <ng_dir> <out_file>");
            System.exit(1);
        }


        DocumentPreprocessor processor = new DocumentPreprocessor();
        PrintWriter pw = new PrintWriter(args[1]);


        File baseNGDir = new File(args[0]);
        // Iterate over the newsgroup directories in the main directory.
        for (File newsGroupDir : baseNGDir.listFiles()) {


            // Skip any non-directories.
            if (!newsGroupDir.isDirectory())
                continue;


            // Iterate over the individual postings in each newsgroup.
            for (File newsGroupEntry : newsGroupDir.listFiles()) {
                BufferedReader br = 
                    new BufferedReader(new FileReader(newsGroupEntry));
                StringBuilder sb = new StringBuilder();
                boolean startedContent = false;


                // Scan through the posting to find the "Lines" line.  This line
                // marks the beginning of the real newsgroup data.
                for (String line = null; (line = br.readLine()) != null; ) {
                    if (startedContent)
                        sb.append(line).append(" ");
                    if (line.startsWith("Lines:"))
                        startedContent = true;
                }


                // Clean and write the posting's content to the output file.
                sb.append("\n");
                String cleanedContent = processor.process(sb.toString());
                System.out.println(newsGroupEntry.getAbsolutePath());
                pw.printf("%s\n", cleanedContent);
                br.close();
            }
        }

View Full Code Here

Examples of edu.ucla.sspace.text.DocumentPreprocessor

    PrintWriter writer = null;
    beginTime = begin;
    endTime = end;
    try {
      writer = new PrintWriter(outFile);
      processor = new DocumentPreprocessor(wordFile);
    } catch (FileNotFoundException fnee) {
      fnee.printStackTrace();
      System.exit(1); 
    } catch (IOException ioe) {
      ioe.printStackTrace();

View Full Code Here

0 1

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.