Package uk.ac.cam.ch.wwmm.oscar3.dataparse

Examples of uk.ac.cam.ch.wwmm.oscar3.dataparse.RPNode


    Set<String> engWords = new HashSet<String>(NGramBuilder.getInstance().engSet);
   
    List<File> files = new ArrayList<File>();
    files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");

    StringSource ss = new StringSource(files, false);
   
    Bag<String> wordCounts = new Bag<String>();
   
    ss.reset();
    for(String s : ss) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(String word : t.getTokenStringList()) {
        if(!word.matches(".*[a-z][a-z].*")) continue;
        word = StringTools.normaliseName(word);
View Full Code Here


        neByLastToken.put(ne.getLastToken(), ne);
      }
    }

    for(NamedEntity ne : entities) {
      Token prev = ne.getFirstToken().getNAfter(-1);
      Token next = ne.getLastToken().getNAfter(1);
      if(prev != null && next != null && prev.getValue().equals("(") && next.getValue().equals(")")) {
        Token prev2 = ne.getFirstToken().getNAfter(-2);
        if(prev2 != null) {
          String surf = ne.getSurface();
          if(surf.matches(".*[A-Z]s") || prev2.getValue().endsWith("s")) surf = surf.substring(0, surf.length()-1);
          List<String> featuresForAbbrev;
          if(abbrevFeatures.containsKey(surf)) {
            featuresForAbbrev = abbrevFeatures.get(surf);
          } else {
            featuresForAbbrev = new ArrayList<String>();
View Full Code Here

  public Document runGenia(Document sourceDoc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, true);
    Document safDoc = new Document(new Element("saf"));
    for(List<Token> sentence : procDoc.getSentences()) {
      if(sentence.size() > 0) {
        Token first = sentence.get(0);
        Token last = sentence.get(sentence.size()-1);
        Element sentenceAnnot = SafTools.makeAnnot(first.getStartXPoint(), last.getEndXPoint(), "sentence");
        safDoc.getRootElement().appendChild(sentenceAnnot);
      }
    }
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      for(Token t : ts.getTokens()) {
View Full Code Here

      if(line.matches("\\s*")) {
        writer.flush();
        line = null;
      } else {
        String [] sss = line.split("\t");
        Token t = sentence.get(index);
        t.setGeniaData(sss);
        line = bufferedReader.readLine();       
        index++;
      }
    }

View Full Code Here

    //  System.out.println();
    //}
  }
 
  private static NamedEntity makeNE(List<Token> neTokens, String neType) {
    Token firstToken = neTokens.get(0);
    Token lastToken = neTokens.get(neTokens.size()-1);
    String surf = firstToken.getTokenSequence().getSubstring(firstToken.getId(), lastToken.getId());
    return new NamedEntity(neTokens, surf, "GENIA-" + neType);
  }
View Full Code Here

  private void makeFeatures(int position) {
    List<String> local = features.get(position);
    List<String> contextable = contextableFeatures.get(position);
    List<String> bigramable = bigramableFeatures.get(position);

    Token token = tokSeq.getToken(position);
    String word = token.getValue();
    contextable.add(makeWordFeature(word));

    String normWord = StringTools.normaliseName(word);
    if (!word.equals(normWord)) {
      contextable.add(makeWordFeature(normWord));
View Full Code Here

  int truePos;
  int falsePos;
  int falseNeg;
 
  private String repForToken(TokenSequence t, int i) {
    Token token = t.getToken(i);
    String s = token.getValue();
    //if(s.equals("(")) {
    //  Token tt = token.getNAfter(1);
    //  if(tt != null && tt.getValue().equals("Found")) s += tt.getValue();
    //}
    //if(token.isCompRef()) return "$CR";
View Full Code Here

    String tag = bestLast;
    for(int i=t.size()-1;i>=0;i--) {
      tags.set(i, tag);
      tag = prevsByLabel.get(i).get(tag);
    }
    Token startToken = null;
    for(int i=0;i<t.size();i++) {
      //System.out.println(t.getToken(i).getValue() + "\t" + tags.get(i));
      if(startToken != null && !tags.get(i).matches("[IE]-ONT.*")) {
        //System.out.println(t.getSubstring(startToken.getId(), t.getToken(i-1).getId()));
        results.add("[" + startToken.getStart() + ":" + t.getToken(i-1).getEnd() + "]");
        startToken = null;
      }
      if(tags.get(i).equals("B-ONT")) startToken = t.getToken(i);
    }
    if(startToken != null) {
      results.add("[" + startToken.getStart() + ":" + t.getToken(t.size()-1).getEnd() + "]");
      //System.out.println(t.getSubstring(startToken.getId(), t.size()-1));
    }
    return results;
  }
View Full Code Here

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        if(tokSeqs.containsKey(tokSeq)) {
          if(isReact) tokSeqs.put(tokSeq, true);
        } else {
          tokSeqs.put(tokSeq, isReact);
View Full Code Here

    }
   
    //if(surface.matches("([Pp]oly).+")) features.add("polymer");
    //if(surface.matches(".+\\(\\d\\d\\d+\\)")) features.add("surfacenotation");
   
    Token t = state.procDoc.getTokenByStart(annot.getAttributeValue("from"));
    if(fPrevious && t != null) {
      Token tt = t.getNAfter(-1);
      if(tt != null) {
        for(int i=1;i<=1;i++) {
          if(TokenTypes.isRef(tt) && tt.getNAfter(-1) != null) {
            //  features.add(prefix + "skiprefprev");
            tt = tt.getNAfter(-1);
          }
          String ttv = tt.getValue();
          ttv = ttv.replaceAll("\\s+", "_");
          //if(i == 1) features.add("prev" + 1 + "=" + ttv);
          features.add("pbg" + i + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("pbg" + (i+1) + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("pbg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("uibg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
         
          tt = tt.getNAfter(-1);
          if(tt == null) break;
        }       

       
        //if(ttv.length() > 4) features.add("prevs=" + ttv.substring(ttv.length()-4) + "_" + surface.replaceAll("\\s+", "_"));
        //features.add("psbg=" + ttv + "_" + suffix.replaceAll("\\s+", "_"));
        //features.add("pstbg=" + ttv + "_" + stem);
       
      }
    }
   
    t = state.procDoc.getTokenByEnd(annot.getAttributeValue("to"));
    if(fNext && t != null) {
      Token tt = t.getNAfter(1);
      //if(tt != null && tt.getValue().equals("-")) {
      //  tt = t.getNAfter(2);
      //}
      if(tt != null) {
        for(int i=1;i<=1;i++) {
          if(TokenTypes.isRef(tt) && tt.getNAfter(1) != null) {
            tt = tt.getNAfter(1);
          }
         
          String ttv = tt.getValue();
          ttv = ttv.replaceAll("\\s+", "_");
          //if(i == 1) features.add("next=" + ttv);
          features.add("nbg" + i + "=" + surface.replaceAll("\\s+", "_") + "_" + ttv);
         
          //features.add("uibg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));

         
          tt = tt.getNAfter(1);
          if(tt == null) break;
        }
        //if(ttv.length() > 4) features.add("nexts=" + surface.replaceAll("\\s+", "_")  + "_" + ttv.substring(ttv.length()-4));
        //features.add("nsbg=" + suffix.replaceAll("\\s+", "_")  + "_" + ttv);
        //features.add("nstbg=" + stem  + "_" + ttv);
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.dataparse.RPNode

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.