Package edu.stanford.nlp.semgraph.semgrex

Source Code of edu.stanford.nlp.semgraph.semgrex.SemgrexBatchParser

package edu.stanford.nlp.semgraph.semgrex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
* Parses a batch of SemgrexPatterns from a stream
* Each SemgrexPattern must be defined in a single line
* This includes a preprocessor that supports macros, defined as: "macro NAME = VALUE" and used as ${NAME}
* For example:
*   # lines starting with the pound sign are skipped
*   macro JOB = president|ceo|star
*   {}=entity >appos ({lemma:/${JOB}/} >nn {ner:ORGANIZATION}=slot)
*/
public class SemgrexBatchParser {
  /** Maximum stream size in characters */
  private static final int MAX_STREAM_SIZE = 1024 * 1024;
 
  public List<SemgrexPattern> compileStream(InputStream is) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    reader.mark(MAX_STREAM_SIZE);
    Map<String, String> macros = preprocess(reader);
    reader.reset();
    return parse(reader, macros);
  }
 
  private List<SemgrexPattern> parse(BufferedReader reader, Map<String, String> macros) throws IOException {
    List<SemgrexPattern> patterns = new ArrayList<SemgrexPattern>();
    for(String line; (line = reader.readLine()) != null; ) {
      line = line.trim();
      if(line.length() == 0 || line.startsWith("#")) continue;
      if(line.startsWith("macro ")) continue;
      line = replaceMacros(line, macros);
      SemgrexPattern pattern = SemgrexPattern.compile(line);
      patterns.add(pattern);
    }
    return patterns;
  }
 
  private static final Pattern MACRO_NAME_PATTERN = Pattern.compile("\\$\\{[a-z0-9]+\\}", Pattern.CASE_INSENSITIVE);
 
  private String replaceMacros(String line, Map<String, String> macros) {
    StringBuffer out = new StringBuffer();
    Matcher matcher = MACRO_NAME_PATTERN.matcher(line);
    int offset = 0;
    while(matcher.find(offset)) {
      int start = matcher.start();
      int end = matcher.end();
      String name = line.substring(start + 2, end - 1);
      String value = macros.get(name);
      if(value == null){
        throw new RuntimeException("ERROR: Unknown macro \"" + name + "\"!");
      }
      if(start > offset) {
        out.append(line.substring(offset, start));
      }
      out.append(value);
      offset = end;
    }
    if(offset < line.length()) out.append(line.substring(offset));
    String postProcessed =  out.toString();
    if(! postProcessed.equals(line)) System.err.println("Line \"" + line + "\" changed to \"" + postProcessed + "\"");
    return postProcessed;
  }
 
  private Map<String, String> preprocess(BufferedReader reader) throws IOException {
    Map<String, String> macros = Generics.newHashMap();
    for(String line; (line = reader.readLine()) != null; ) {
      line = line.trim();
      if(line.startsWith("macro ")){
        Pair<String, String> macro = extractMacro(line);
        macros.put(macro.first(), macro.second());
      }
    }
    return macros;
  }
 
  private Pair<String, String> extractMacro(String line) {
    assert(line.startsWith("macro"));
    int equalPosition = line.indexOf('=');
    if(equalPosition < 0) {
      throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
    }
    String name = line.substring(5, equalPosition).trim();
    if(name.length() == 0) {
      throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
    }
    String value = line.substring(equalPosition + 1).trim();
    if(value.length() == 0) {
      throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!");
    }
    return new Pair<String, String>(name, value);
  }
}
TOP

Related Classes of edu.stanford.nlp.semgraph.semgrex.SemgrexBatchParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.