Package uk.ac.cam.ch.wwmm.oscar3.flow

Examples of uk.ac.cam.ch.wwmm.oscar3.flow.OscarFlow


    // Strip the inline annotation elements, keeping their text content.
    for(int i=0;i<nodes.size();i++) {
      XOMTools.removeElementPreservingText((Element)nodes.get(i));
    }
    // Pull the inline annotations out into a SAF standoff document.
    Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);

    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(sourceDoc);
    //nr.makeTokenisers(false);

    // Select the annots whose type slot is PRW.
    Nodes n = safDoc.query("/saf/annot[slot[@name='type'][.='PRW']]");

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      // Resolve the annotation to a token by its start offset, falling back to its end offset.
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!"); // annotation does not line up with the tokenisation
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
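
The excerpt above resolves each PRW annotation in the SAF standoff document back to a token via its character offsets, then checks whether its subtype slot is REACT. It cuts off before the two maps declared above are filled; one plausible continuation, purely as a hedged sketch (the real method may differ), would be:

      // Hypothetical continuation: mark a sequence as reactive if any of its PRWs is REACT,
      // and bag the PRW surface strings per TokenSequence.
      if(!tokSeqs.containsKey(tokSeq) || isReact) tokSeqs.put(tokSeq, isReact);
      if(!tokSeqPRWs.containsKey(tokSeq)) tokSeqPRWs.put(tokSeq, new Bag<String>());
      tokSeqPRWs.get(tokSeq).add(token.getValue());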


      for(int i=0;i<nodes.size();i++) {
        XOMTools.removeElementPreservingText((Element)nodes.get(i));
      }
      Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.makeTokenisers(false);
      Set<String> tokenSet = new HashSet<String>();
      Bag<String> tokenBag = new Bag<String>();
      for(TokenSequence t : procDoc.getTokenSequences()) {
        //System.out.println(t.getSourceString());
        for(Token token : t.getTokens()) {
          //tokenSet.add("stem=" + stemmer.getStem(token.getValue().toLowerCase()));
          //tokenSet.add(token.getValue().toLowerCase());
          tokenBag.add(token.getValue().toLowerCase());
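
This excerpt tokenises a source document and counts lower-cased token strings in a Bag (the commented-out lines show a set-of-types and a stemmed variant). It assumes a parsed sourceDoc is already in hand; a minimal way to obtain one, as a fragment with imports omitted and an illustrative file name, assuming XOM's Builder:

      // Parse the XML source file with XOM, then tokenise it as in the excerpt above.
      Document sourceDoc = new nu.xom.Builder().build(new File("source.xml"));
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);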

      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/BioIE"), "source.xml");
      files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/scratch/pubmed/2005"), "source.xml");
      StringSource ss = new StringSource(files, false);
     
      Bag<String> wordCounts = new Bag<String>();
     
      ss.reset();
      for(String s : ss) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          if(!word.matches(".*[a-z][a-z].*")) continue;
          word = StringTools.normaliseName(word);

    Set<String> engWords = new HashSet<String>(NGramBuilder.getInstance().engSet);
   
    List<File> files = new ArrayList<File>();
    files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");

    StringSource ss = new StringSource(files, false);
   
    Bag<String> wordCounts = new Bag<String>();
   
    ss.reset();
    for(String s : ss) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(String word : t.getTokenStringList()) {
        if(!word.matches(".*[a-z][a-z].*")) continue;
        word = StringTools.normaliseName(word);
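
Both corpus-walking excerpts stop right after normalising the word, but the surrounding loop clearly exists to fill wordCounts. A hedged guess at the truncated tail (the real code may filter differently):

        // Hypothetical tail of the loop: count the normalised word, optionally
        // skipping words already on the known-English list.
        if(engWords.contains(word)) continue;
        wordCounts.add(word);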

        neByLastToken.put(ne.getLastToken(), ne);
      }
    }

    for(NamedEntity ne : entities) {
      Token prev = ne.getFirstToken().getNAfter(-1);
      Token next = ne.getLastToken().getNAfter(1);
      if(prev != null && next != null && prev.getValue().equals("(") && next.getValue().equals(")")) {
        Token prev2 = ne.getFirstToken().getNAfter(-2);
        if(prev2 != null) {
          String surf = ne.getSurface();
          if(surf.matches(".*[A-Z]s") || prev2.getValue().endsWith("s")) surf = surf.substring(0, surf.length()-1);
          List<String> featuresForAbbrev;
          if(abbrevFeatures.containsKey(surf)) {
            featuresForAbbrev = abbrevFeatures.get(surf);
          } else {
            featuresForAbbrev = new ArrayList<String>();
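
This excerpt looks for a named entity wrapped in parentheses (a likely abbreviation), trims a trailing plural "s" when either the entity or the preceding word looks plural, and collects features keyed by the abbreviation's surface string. It cuts off inside a get-or-create lookup on abbrevFeatures; that idiom normally finishes by registering the new list, sketched here as an assumption:

            // Hypothetical completion of the get-or-create idiom above.
            abbrevFeatures.put(surf, featuresForAbbrev);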

  public Document runGenia(Document sourceDoc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, true);
    Document safDoc = new Document(new Element("saf"));
    for(List<Token> sentence : procDoc.getSentences()) {
      if(sentence.size() > 0) {
        Token first = sentence.get(0);
        Token last = sentence.get(sentence.size()-1);
        Element sentenceAnnot = SafTools.makeAnnot(first.getStartXPoint(), last.getEndXPoint(), "sentence");
        safDoc.getRootElement().appendChild(sentenceAnnot);
      }
    }
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      for(Token t : ts.getTokens()) {
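
runGenia builds a fresh SAF document and adds one "sentence" annot per non-empty sentence, spanning the first token's start x-point to the last token's end x-point. The excerpt truncates inside the per-token loop; a hedged sketch of what a per-token annot would look like with the same helper (the real body may add more slots):

        // Hypothetical body of the per-token loop: one annot per token,
        // mirroring the sentence annots created above.
        Element tokenAnnot = SafTools.makeAnnot(t.getStartXPoint(), t.getEndXPoint(), "token");
        safDoc.getRootElement().appendChild(tokenAnnot);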

      if(line.matches("\\s*")) {
        // A blank line in the tagger output marks the end of the sentence.
        writer.flush();
        line = null;
      } else {
        String [] sss = line.split("\t");
        Token t = sentence.get(index);
        t.setGeniaData(sss);
        line = bufferedReader.readLine();       
        index++;
      }
    }

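
This fragment consumes GENIA tagger output: one tab-separated record per token, attached to the matching Token via setGeniaData, with a blank line closing the sentence. A hedged sketch of the setup such a loop needs; the stream and variable names here are purely illustrative:

      // Illustrative setup: read the tagger output line by line and walk the
      // sentence's tokens in parallel with an index.
      BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(geniaOutput, "UTF-8"));
      String line = bufferedReader.readLine();
      int index = 0;
      // ... then loop while line != null, as in the fragment above.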

    //  System.out.println();
    //}
  }
 
  private static NamedEntity makeNE(List<Token> neTokens, String neType) {
    Token firstToken = neTokens.get(0);
    Token lastToken = neTokens.get(neTokens.size()-1);
    // The surface string is the source text spanning the first to the last token (by token id).
    String surf = firstToken.getTokenSequence().getSubstring(firstToken.getId(), lastToken.getId());
    return new NamedEntity(neTokens, surf, "GENIA-" + neType);
  }
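
makeNE wraps a run of tokens into a NamedEntity whose surface string is read straight from the token sequence and whose type is prefixed with "GENIA-". A hypothetical call site (the token list and tag name are illustrative):

    // Hypothetical usage: a run of tokens the GENIA tagger labelled as a protein.
    NamedEntity ne = makeNE(neTokens, "protein"); // yields type "GENIA-protein"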

  private void makeFeatures(int position) {
    List<String> local = features.get(position);
    List<String> contextable = contextableFeatures.get(position);
    List<String> bigramable = bigramableFeatures.get(position);

    Token token = tokSeq.getToken(position);
    String word = token.getValue();
    contextable.add(makeWordFeature(word));

    String normWord = StringTools.normaliseName(word);
    if (!word.equals(normWord)) {
      contextable.add(makeWordFeature(normWord));
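
makeFeatures collects per-position feature strings into three parallel lists (plain, contextable, bigramable), adding the raw word and, when it differs, its normalised form. The helper makeWordFeature is not shown in the excerpt; a plausible shape, purely hypothetical (the real prefix may differ), is:

  // Hypothetical helper: prefix the raw string so word features are distinguishable
  // from other feature kinds. The "w=" prefix is illustrative.
  private String makeWordFeature(String word) {
    return "w=" + word;
  }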

  int truePos;
  int falsePos;
  int falseNeg;
 
  private String repForToken(TokenSequence t, int i) {
    Token token = t.getToken(i);
    String s = token.getValue();
    //if(s.equals("(")) {
    //  Token tt = token.getNAfter(1);
    //  if(tt != null && tt.getValue().equals("Found")) s += tt.getValue();
    //}
    //if(token.isCompRef()) return "$CR";
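
The three counters above are the usual ingredients for precision/recall scoring. The excerpt does not show the arithmetic; the standard computation would be:

  // Standard precision/recall/F1 over the counters (guards against zero denominators omitted).
  double precision = truePos / (double)(truePos + falsePos);
  double recall = truePos / (double)(truePos + falseNeg);
  double f = 2 * precision * recall / (precision + recall);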
