Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.TokenSequence
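The excerpts below share one basic pattern: obtain a TokenSequence (from the Tokeniser singleton, from a ProcessingDocument, or from a Lattice) and then walk its tokens, either as plain strings or as Token objects carrying character offsets. The following minimal sketch isolates that pattern. It is assembled only from calls that appear in the excerpts; the class name TokenSequenceSketch and the sample sentence are invented for illustration, and the import locations of Token and Tokeniser are assumed to match TokenSequence's package, which may differ in a given OSCAR3 release.

    import java.util.List;

    // Assumed imports: only TokenSequence's package is stated on this page.
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Token;
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.TokenSequence;
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Tokeniser;

    public class TokenSequenceSketch {
      public static void main(String[] args) {
        // Tokenise a raw string via the Tokeniser singleton, as several excerpts below do.
        TokenSequence tokSeq = Tokeniser.getInstance().tokenise(
            "The 13C NMR spectrum was recorded in CDCl3.");

        // The token values as plain strings.
        List<String> words = tokSeq.getTokenStringList();
        System.out.println(words);

        // Or the Token objects themselves, with start/end character offsets.
        for(Token t : tokSeq.getTokens()) {
          System.out.println(t.getValue() + " [" + t.getStart() + ".." + t.getEnd() + "]");
        }
      }
    }

Several excerpts instead pull the TokenSequence from a ProcessingDocument (procDoc.getTokenSequences()) or from a Lattice (lattice.tokSeq) rather than tokenising raw text.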


      // Map a SAF annotation back onto the token stream: try the token starting at the
      // "from" offset, then fall back to the token ending at the "to" offset.
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        //boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        boolean isPubmed = f.toString().contains("pubmed");
        if(tokSeqs.containsKey(tokSeq)) {
          if(isPubmed) tokSeqs.put(tokSeq, true);
        } else {
        // ...


   
    // Read the per-word text file, tokenise each line, and check (case-insensitively)
    // whether the word actually occurs as a token before collecting collocations.
    File f = new File("/home/ptc24/tmp/yahoo/" + word + ".txt");
    List<String> strings = FileTools.getStrings(new FileInputStream(f));
    Bag<String> collocs = new Bag<String>();
    for(String s : strings) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      List<String> tokens = t.getTokenStringList();
      boolean hasCell = false;
      for(String token : tokens) {
        if(token.toLowerCase().equals(word)) {
          hasCell = true;
          break;
          // ...

        for(LatticeCell prev : cell.prev) {
          System.out.println("\t<-\t" + prev);
        }
      }*/
     
      // Split the lattice's TokenSequence into sentences for per-sentence processing.
      TokenSequence tokSeq = lattice.tokSeq;
      List<Token> tokens = tokSeq.getTokens();
      List<List<Token>> sentences = SentenceSplitter.makeSentences(tokens);
      Set<String> existingRelations = new HashSet<String>();
      for(List<Token> sentence : sentences) {
        /*NewGeniaRunner.runGenia(sentence);
        List<NamedEntity> bioNEs = NewGeniaRunner.getGeniaNEs(sentence);
        // ...

   
    // Build lattices from the processed document, then run the DFA-based relation
    // finder over each sentence of each lattice's TokenSequence.
    DFARelationFinder relf = DFARelationFinder.getInstance();
    List<Lattice> lattices = Lattice.buildLattices(procDoc, safDoc.getRootElement());
   
    for(Lattice lattice : lattices) {
      TokenSequence tokSeq = lattice.tokSeq;
      List<Token> tokens = tokSeq.getTokens();
      List<List<Token>> sentences = SentenceSplitter.makeSentences(tokens);
      Set<String> existingRelations = new HashSet<String>();
      for(List<Token> sentence : sentences) {
        List<Relation> relations = relf.getRelations(sentence, "", lattice, null);
        if(relations.size() > 0) {
          // ...

      if(col >= 80) {
        col = 0;
        System.out.println();
      }*/
      //System.out.println(s);
      // Tokenise the string and scan its tokens, accumulating collocation counts in collCounts.
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      LinkedList<StringBuffer> prevBuffer = new LinkedList<StringBuffer>();
      for(String word : t.getTokenStringList()) {
        if(!word.matches(".*[a-zA-Z0-9].*|-|/")) {
          if(prevBuffer.size() > 0) prevBuffer.removeLast();
          for(StringBuffer sb : prevBuffer) {
            collCounts.add(sb.toString());
          }
          // ...

   
    // Tokenise the text of each chemically relevant XML node and collect the
    // normalised, interned token strings.
    Nodes n = XMLStrings.getInstance().getChemicalPlaces(doc);
    for(int i=0;i<n.size();i++) {
      String s = n.get(i).getValue();
      if(s != null) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          results.add(StringTools.normaliseName2(word).intern());
        }       
      }
    }
   
    // ...

      int start = Integer.parseInt(e.getAttributeValue("xtspanstart"));
      int end = Integer.parseInt(e.getAttributeValue("xtspanend"));
      putDatasectionIntoSaf(procDoc, safHolder, start, end);
      return;     
    }
    // Find the TokenSequence that was built from this XML element.
    TokenSequence tokSeq = null;
    for(TokenSequence t : procDoc.getTokenSequences()) {
      if(t.getElem() == e) tokSeq = t;
    }
    int dataSectionStartOffset = -1;
    if(tokSeq != null) {
      for(Token t : tokSeq.getTokens()) {
        //System.out.println(t.getValue());
        Matcher m = dataStartPattern.matcher(t.getValue());
        // A data section is cued by a token matching dataStartPattern, or by
        // "1H"/"13C" immediately followed by "NMR".
        if(m.matches() || (t.getValue().matches("1H|13C") && t.getNAfter(1) != null && t.getNAfter(1).getValue().equals("NMR"))) {
          double checkVal = checkCut(tokSeq, t.getId());
          //System.out.println(checkVal);
          // ...

   
    // Stream strings from the files, collapse whitespace, and look for word1 followed
    // within the next nine tokens by word2, recording both tokens' character offsets.
    StringSource ss = new StringSource(files, false);
    boolean doSort = true;
    for(String s : ss) {
      s = s.replaceAll("\\s+", " ");
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(int i=0;i<t.size();i++) {
        Token token = t.getToken(i);
        if(token.getValue().equalsIgnoreCase(word1)) {
          for(int j=i+1;j<t.size() && j<i+10;j++) {
            Token token2 = t.getToken(j);
            if(token2.getValue().equalsIgnoreCase(word2)) {
              int wstart1 = token.getStart();
              int wend1 = token.getEnd();
             
              int wstart2 = token2.getStart();
              // ...

    //files = files.subList(0, 10);
   
   
    Bag<String> tokenBag = new Bag<String>();
   
    // Iterate over every TokenSequence produced from the files and count the
    // interned token values in a Bag.
    TokenSequenceSource tss = new TokenSequenceSource(files);
    int i=0;
    for(TokenSequence ts : tss) {
      for(Token t : ts.getTokens()) {
        tokenBag.add(t.getValue().intern());
      }
      // ...

      corpusOffset++;
    }
  }
 
  public InverseSearcher(List<File> files) throws Exception {
    // Read TokenSequences from the given files and initialise the index structures.
    TokenSequenceSource tss = new TokenSequenceSource(files);

    corpusArray = new ArrayList<Integer>();
    offsetArray = new ArrayList<Integer>();
    tokenList = new ArrayList<String>();
    tokenIndex = new HashMap<String,Integer>();
    // ...
