Package uk.ac.cam.ch.wwmm.oscar3.flow

Examples of uk.ac.cam.ch.wwmm.oscar3.flow.OscarFlow

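OscarFlow wraps a SciXML document and runs a named sequence of processing stages over it; the stage names in the flow string (for example "recognise resolve numbersaf inline data") determine which XML outputs are available afterwards. The excerpts below show OscarFlow driven from an annotation editor, a servlet, unit tests, and several Lucene-based cluster-analysis tools. As a minimal sketch of the core pattern: every OscarFlow and TextToSciXML call here appears in the excerpts, but the XOM import paths are an assumption, and the import for TextToSciXML is omitted because its package does not appear on this page.

  import nu.xom.Document;
  import nu.xom.Serializer;
  import uk.ac.cam.ch.wwmm.oscar3.flow.OscarFlow;

  public class OscarFlowSketch {
    public static void main(String[] args) throws Exception {
      // Wrap plain text as a SciXML document (the helper used in the tests
      // below; its import is omitted here, as noted above).
      Document doc = TextToSciXML.textToSciXML("Benzene was treated with nitric acid.");
      OscarFlow flow = new OscarFlow(doc);
      // Run named-entity recognition and write the results back as inline markup.
      flow.runFlow("recognise inline");
      // Print the inline-annotated XML.
      new Serializer(System.out).write(flow.getInlineXML());
    }
  }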

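From an annotation editor: re-annotate the working document with a recognise/inline flow, then rebuild it from the inline XML.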
                writeMode2Doc(doc2);
        }

        public void autoAnnotate() throws Exception {
                clearAnnotations();
                OscarFlow oscarFlow = new OscarFlow(doc);
                oscarFlow.runFlow("recognise inline");
                doc = ARTSciXMLDocument.makeFromDoc(oscarFlow.getInlineXML());
                writeScrapBook();

                // Adapt this and clear annotations for mode 2
                // need a SciXMLDocument object from mode2.xml using Builder
                // need a writeMode2 method
        }
       
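The same pattern for the editor's second ("mode 2") document, returning the rebuilt document: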
        public ARTSciXMLDocument autoAnnotate2() throws Exception {
                clearAnnotations2();
                OscarFlow oscarFlow = new OscarFlow(doc2);
                oscarFlow.runFlow("recognise inline");
                doc2 = ARTSciXMLDocument.makeFromDoc(oscarFlow.getInlineXML());
                writeMode2Doc(doc2);
                return doc2;
        }

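From a servlet: run a flow over the incoming document, then serialise whichever output the request asked for.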
        out.close();
        return;
      }
      /* Process the document */     

      OscarFlow oscarFlow = new OscarFlow(plainDoc);
     
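      // A non-empty "flowcommand" parameter picks the pipeline; otherwise the
      // output mode selects a default (data parsing or the lite process).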
      if(request.getParameter("flowcommand") != null &&
          request.getParameter("flowcommand").trim().length() > 0) {
        oscarFlow.runFlow(request.getParameter("flowcommand"));
      } else if(outputMode.equals("data")) {
        oscarFlow.parseData();
      } else {
        oscarFlow.processLite();
      }
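      // A custom flow that never ran the "inline" stage has no inline XML,
      // so default output falls back to standoff (SAF).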
      if(outputMode.equals("default") && request.getParameter("flowcommand") != null &&
          !request.getParameter("flowcommand").toLowerCase().contains("inline")) {
        outputMode = "saf";
      }
     
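      // Choose the Content-Type: custom output is guessed from the requested
      // file name, everything else is served as XML.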
      if(outputMode.equalsIgnoreCase("custom")) {
        String name = request.getParameter("name");
        if(name.endsWith(".xml")) {
          response.setContentType("application/xml");
          response.setCharacterEncoding("UTF-8");         
        } else if(name.endsWith(".htm") || name.endsWith(".html")) {
          response.setContentType("text/html");
          response.setCharacterEncoding("UTF-8");
        } else {
          response.setContentType("text/plain");
          response.setCharacterEncoding("UTF-8");
        }
      } else {
        response.setContentType("application/xml");
        response.setCharacterEncoding("UTF-8");
      }
     
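      // Serialise whichever representation matches the output mode.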
      OutputStream out = response.getOutputStream();
      if(outputMode.equalsIgnoreCase("saf")) {
        new Serializer(out).write(oscarFlow.getSafXML());       
      } else if(outputMode.equalsIgnoreCase("data")) {
        new Serializer(out).write(oscarFlow.getDataXML());       
      } else if(outputMode.equalsIgnoreCase("genia")) {
        new Serializer(out).write(oscarFlow.getGeniaSAF());
      } else if(outputMode.equalsIgnoreCase("relation")) {
        new Serializer(out).write(oscarFlow.getRelationXML());
      } else if(outputMode.equalsIgnoreCase("custom")) {
        String name = request.getParameter("name");
        oscarFlow.writeCustomeOutputToStream(name, out);
      } else {
        // default to markedup
        doc = SciXMLDocument.makeFromDoc(oscarFlow.getInlineXML());
        doc.addServerProcessingInstructions();
        new Serializer(out).write(doc);
      }
      out.close();
     

   
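From a unit test: run a multi-stage flow over a test document and check that each requested output exists.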
    String s = rg.getString("testcard.txt");
    assertTrue("Have testcard string", s != null && s.length() > 0);
    Document doc = TextToSciXML.textToSciXML(s);
    assertTrue("Have SciXML document", doc != null);
    OscarFlow flow = new OscarFlow(doc);
    flow.runFlow("recognise resolve numbersaf inline data");
    assertNotNull("Have source XML", flow.getSourceXML());
    assertNotNull("Have inline XML", flow.getInlineXML());
    assertNotNull("Have SAF XML", flow.getSafXML());
    assertNotNull("Have data XML", flow.getDataXML());
   
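    // Filter semantics exercised below: chain(...) composes two filters and
    // setNegate(true) inverts one.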
    SAFTester tester = new SAFTester(flow.getSafXML());
    assertTrue("At least one annot", tester.atLeastOne(new AllElementsFilter()));
    assertTrue("Null filter gives no annots", tester.no(new NoElementsFilter()));
    assertTrue("At least one annot", tester.atLeastOne(new NoElementsFilter().setNegate(true)));
    assertTrue("Null filter gives no annots", tester.no(new AllElementsFilter().setNegate(true)));
    assertTrue("At least one annot", tester.atLeastOne(new AllElementsFilter().chain(new AllElementsFilter())));

  }
 
  public void testTestCard2() throws Exception {
    String s = rg.getString("testcard2.txt");
    Document doc = TextToSciXML.textToSciXML(s);
    OscarFlow flow = new OscarFlow(doc);
    flow.runFlow("recognise resolve");
    assertNotNull("Have SAF XML", flow.getSafXML());
   
    SAFTester tester = new SAFTester(flow.getSafXML());
    assertTrue("Has a CM", tester.atLeastOne(new TypeFilter("CM")));
    assertTrue("Has an ONT", tester.atLeastOne(new TypeFilter("ONT")));
   
    assertTrue("Has a single-word ONT", tester.atLeastOne(new TypeFilter("ONT").chain(new SurfaceRegexFilter("[a-z]+"))));
    assertTrue("Has a multi-word ONT", tester.atLeastOne(new TypeFilter("ONT").chain(new SurfaceRegexFilter("([a-z]+ )+[a-z]+"))));

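From a cluster-analysis tool: score terms by how over-represented they are in a document cluster relative to the whole Lucene index.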
      System.out.println(s + "\t" + tfIdf.get(s));
   
  }
 
  public static Map<String,Double> excessAnalyseCluster(Map<Integer,Float> cluster, IndexReader ir, double threshold, boolean enriched) throws Exception {
    LuceneChemicalIndex lci = new LuceneIndexerSearcher(false).getLci();
    Set<String> inchis = new HashSet<String>();
    Set<String> onts = new HashSet<String>();
   
    List<File> clusterFiles = new ArrayList<File>();
    for(Integer i : cluster.keySet()) {
      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
      if(enriched) {
        TermFreqVector tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          for(String term : tvf.getTerms()) inchis.add(term);
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          for(String term : tvf.getTerms()) onts.add(term);
        }
      }
    }
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> df = ngtd.getDfBag(1);
    df.discardInfrequent(2);
    Map<String,Double> scores = new HashMap<String,Double>();
    int numDocs = ir.numDocs();
    int clusterSize = cluster.size();
    double scaleFactor = clusterSize * 1.0 / numDocs;
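    // Fraction of the whole index that lies in the cluster: the expected
    // chance hit rate for any term.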
    IndexSearcher is = new IndexSearcher(ir);
    for(String s : df.getSet()) {
      //System.out.println(s);
      int docFreq = 0;
      Query q;
      if(s.matches("\\S+")) {
        TermQuery tq = new TermQuery(new Term("txt", s));
        q = tq;
        //docFreq = ir.docFreq(new Term("txt", s));
      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        q = pq;
      }
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      docFreq = vc.getResultsVector().size();
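      // Score the surplus of observed in-cluster document frequency over the
      // corpus-predicted value.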
      double score;
      double expected = scaleFactor * docFreq;
      double excess = df.getCount(s) - expected;
      score = excess / clusterSize;       
      if(score > threshold) scores.put(s, score);
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(df.getSet());
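    // Group single words by stem and score each stem group as the union (OR)
    // of its members.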
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(stems.get(stem).toString(), overlap);
          scores.put(stems.get(stem).toString(), score);
        }
      }
    }
    Map<String,List<String>> termStems = ngtd.ngramsByStem();
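    // Likewise for multi-word n-grams grouped by stem, using one phrase query
    // per n-gram.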
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(termStems.get(stem).toString(), overlap);
          scores.put(termStems.get(stem).toString(), score);
        }
      }
    }
    if(enriched) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
       
        if(score > threshold) {
          String s = "InChI: " + lci.getName(lci.hitsByInChI(inchi));
          scores.put(s, score);
          df.add(s, overlap);           
        }
      }
     

    return scores;
  }
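The score above is a plain excess-frequency measure. A standalone restatement of the arithmetic (a hypothetical helper, not part of the class above):

  // Hypothetical helper restating the scoring used in excessAnalyseCluster.
  static double excessScore(int dfInCluster, int dfInIndex, int clusterSize, int numDocs) {
    // Fraction of all indexed documents that lie in the cluster.
    double scaleFactor = clusterSize * 1.0 / numDocs;
    // Documents a term hitting dfInIndex documents overall would hit in the cluster by chance.
    double expected = scaleFactor * dfInIndex;
    // Surplus over chance, normalised by cluster size; callers keep scores above a threshold.
    return (dfInCluster - expected) / clusterSize;
  }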

 
  public static void analyseCluster(Map<Integer,Float> cluster, IndexReader ir, DocVectorSimilarity similarity, double threshold) throws Exception {
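    // Unlike excessAnalyseCluster, terms here are scored by the similarity of
    // their document vector to the cluster.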
    LuceneChemicalIndex lci = new LuceneIndexerSearcher(false).getLci();
    List<File> clusterFiles = new ArrayList<File>();
    Bag<String> dfs = new Bag<String>();
    Set<String> inchis = new HashSet<String>();
    Set<String> onts = new HashSet<String>();
    for(Integer i : cluster.keySet()) {
      cluster.put(i, 1.0f);
      TermFreqVector tvf = ir.getTermFreqVector(i, "txt");
      for(String term : tvf.getTerms()) dfs.add(term);
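      // Disabled pass (if(false)): optional enrichment with InChI and
      // ontology term vectors.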
      if(false) {
        tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          for(String term : tvf.getTerms()) inchis.add(term);
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          for(String term : tvf.getTerms()) onts.add(term);
        }
      }
     

      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(dfs.getSet());

    dfs.discardInfrequent(2);
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> bs = ngtd.getDfBag(2);
    bs.discardInfrequent(2);
    Map<String,List<String>> termStems = ngtd.ngramsByStem();

    Map<String,Double> scores = new HashMap<String,Double>();
    Map<String,Integer> overlaps = new HashMap<String,Integer>();
    IndexSearcher is = new IndexSearcher(ir);
    int docTotal = ir.numDocs();
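    // Single-word terms: skip closed-class words and non-alphabetic tokens
    // before scoring.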
    for(String term : dfs.getSet()) {
      if(TermSets.getClosedClass().contains(term) || term.matches("[^A-Za-z]+")) continue;
      Term luceneTerm = new Term("txt", term);
      Query q = new TermQuery(luceneTerm);
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap > 1) {
          scores.put(term, score);
          overlaps.put(term, overlap);           
        }
      }
    }
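    // Stem groups: OR together all surface forms that share a stem.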
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = words.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }
    }
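    // The same for multi-word n-grams grouped by stem.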
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = multiWords.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }
    }
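    // Remaining multi-word n-grams, each as a single phrase query.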
    for(String s : bs.getList()) {
      if(!s.matches(".*\\s+.*")) continue;
      PhraseQuery pq = new PhraseQuery();
      for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
      VectorCollector vc = new VectorCollector();
      is.search(pq, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        scores.put(s, score);
        overlaps.put(s, overlapDocs(vc.getResultsVector(), cluster));
      }
    }
   

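    // Disabled pass (if(false)): would score InChI identifiers like the
    // terms above.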
    if(false) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            String s = "InChI: " + lci.getName(lci.hitsByInChI(inchi));
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }

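Driver programs. The first one queries for a seed term and repeatedly scores the resulting document cluster: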
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();

    Stemmer stemmerTools = new Stemmer(new EnglishStemmer());
   
    //QueryParser qp = new Oscar3QueryParser("txt", new Oscar3Analyzer(), lis, false);
    //Query q = qp.parse("NaCl");
   
    String queryTerm = "lipid";
    //PhraseQuery pq = new PhraseQuery();
    //pq.add(new Term("txt", "aromatase"));
    //pq.add(new Term("txt", "inhibitors"));
    Query q = new TermQuery(new Term("txt", queryTerm));
    //Query q = new StemQuery(new Term("txt", queryTerm), stemmerTools);

    for(int i=0;i<100;i++) {
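      // Search, drop weak hits from the result vector, then score the
      // cluster's terms by excess frequency.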
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      for(Integer j : new ArrayList<Integer>(vc.getResultsVector().keySet())) {
        if(vc.getResultsVector().get(j) < 0.2) vc.getResultsVector().remove(j);
      }
      Map<String,Double> scores = ClusterAnalyser.simpleExcessAnalyseCluster(vc.getResultsVector(), lis.getIndexReader(), 0.1);
      BooleanQuery bq = new BooleanQuery(false);
      List<String> terms = StringTools.getSortedList(scores);
      if(terms.size() > 10) terms = terms.subList(0, 10);
      for(String s : terms) {
        System.out.println(s + "\t" + scores.get(s));

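The second driver builds a document-score vector for every frequent term in the index: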
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    //System.out.println(StringTools.arrayToList(StandardAnalyzer.STOP_WORDS));
    //if(true) return;
   
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);

    String queryTerm = "cyp2d6";
   
    /*List<String> queryList = StringTools.arrayToList(new String[] {
    "cyp2d6",
    "cyp3a4",
    "dextromethorphan"});*/
    List<String> queryList = new ArrayList<String>();
    long time = System.currentTimeMillis();
    IndexReader ir = lis.getIndexReader();
    TermEnum termEnum = ir.terms();
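    // Keep every "txt" term that occurs in more than 20 documents.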
    while(termEnum.next()) {
      Term t = termEnum.term();
      if(t.field().equals("txt") && termEnum.docFreq() > 20) queryList.add(t.text());
    }
    //queryList.addAll(lis.termsFromQuery(new TermQuery(new Term("txt", queryTerm))));
    System.out.println("All terms loaded: " + (System.currentTimeMillis() - time));
   
    time = System.currentTimeMillis();
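    // Build a document-score vector for each collected term.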
    Map<String,Map<Integer,Float>> vectors = new HashMap<String,Map<Integer,Float>>();
    for(String query : queryList) {
      //System.out.println(query);
      vectors.put(query, lis.getScoresVectorForQuery(new TermQuery(new Term("txt", query))));
    }
    /*PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("txt", "cyp2d6"));
    pq.add(new Term("txt", "inhibitors"));
    vectors.put("cyp2d6 inhibitors", lis.getScoresVectorForQuery(pq));*/
 

 
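The third driver walks the term index to tabulate document frequencies: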
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception{
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexReader ir = lis.getIndexReader();
   
    int numDocs = ir.numDocs();
    TermEnum textEnum = ir.terms();
    Map<String,Integer> docFreqs = new HashMap<String,Integer>();
    while(textEnum.next()) {
