Package edu.harvard.wcfia.yoshikoder.document.tokenizer

Examples of edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList


      sb.append(cl.getRightHandSideView());
      sb.append(" ");
    }
    // tokenize and apply dictionary to the 'document'
    YKDocument doc = new YKDocumentImpl(docTitle, sb.toString());
    TokenList tl = TokenizationService.getTokenizationService().tokenize(doc);
    EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl);

    int[] counts = new int[keys.length+1];
    for (int ii=0; ii<keys.length; ii++) {
      Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
View Full Code Here


       
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
       
        // for _all_ categories
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          // note _all_categories counted (implicitly around catnode matches)
View Full Code Here

      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());
View Full Code Here

      //Concordance con = new ConcordanceImpl(wsize);
      TokenizationCache tcache = yoshikoder.getTokenizationCache();
      TokenizationService service = TokenizationService.getTokenizationService();
      Map<YKDocument,Concordance> map = new HashMap<YKDocument,Concordance>();
      for (YKDocument doc : docs) {
        TokenList tl = tcache.getTokenList(doc);
        if (tl == null){
          tl = service.tokenize(doc);
          tcache.putTokenList(doc, tl);
        }
        Concordance c = yoshikoder.getDictionary().getConcordance(tl, n, wsize);
View Full Code Here

  protected int[] getDocumentStats(YKDocument doc, Node[] keys, CategoryNode catnode) throws IOException, TokenizationException {
       
    // tokenize the document
    TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc);
        if (tl1 == null)
            tl1 = TokenizationService.getTokenizationService().tokenize(doc);
       
        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
View Full Code Here

            protected void doWork() throws Exception {
              // FIRST DOC
              YKDocument doc1 = (YKDocument)docs.get(0);
              // tokenize the document
            TokenizationCache tcache = yoshikoder.getTokenizationCache();
            TokenList tl1 = tcache.getTokenList(doc1);
                if (tl1 == null)
                  tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
                 YKDictionary dict = yoshikoder.getDictionary();
               
                 // compute the dictionary counts
View Full Code Here

            protected void doWork() throws Exception {
              // FIRST DOC
              YKDocument doc1 = (YKDocument)docs.get(0);
              // tokenize the document
            TokenizationCache tcache = yoshikoder.getTokenizationCache();
            TokenList tl1 = tcache.getTokenList(doc1);
                if (tl1 == null)
                  tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
                 YKDictionary dict = yoshikoder.getDictionary();
               
                 // compute the dictionary counts
View Full Code Here

                super(name, type);
            }
            public void setPatternEngine(PatternEngine eng){}
            public PatternEngine getPatternEngine(){return null;}
            public long[] test(){
                TokenList tl = new TokenListImpl();
                for (int ii = 0; ii < 1000; ii++) {
                    tl.add(new TokenImpl("china", 0, 5));
                    tl.add(new TokenImpl("sausage", 0, 5));
                }
                PatternNode p = new PatternNodeImpl("chin*", null, Pattern.compile("chin*"));
                System.out.println(p.getPattern());
                long start1 = new Date().getTime();
                Set<Token> l = getMatchingTokens(tl, p);
View Full Code Here

                super(name, type);
            }
            public void setPatternEngine(PatternEngine eng){}
            public PatternEngine getPatternEngine(){return null;}
            public long[] test(){
                TokenList tl = new TokenListImpl();
                for (int ii = 0; ii < 1000; ii++) {
                    tl.add(new TokenImpl("china", 0, 5));
                    tl.add(new TokenImpl("sausage", 0, 5));
                }
                PatternNode p = new PatternNodeImpl("chin*", null, Pattern.compile("chin*"));
                System.out.println(p.getPattern());
                long start1 = new Date().getTime();
                Set<Token> l = getMatchingTokens(tl, p);
View Full Code Here

          Map<YKDocument,EntryFrequencyMap> efmMap;
           
          protected void doWork() throws Exception {
            EntryFrequencyMap efm = null;
            for (YKDocument doc : concmap.keySet()) {
              TokenList tlist = new TokenListImpl();
              Concordance conc = concmap.get(doc);
              for (Iterator iter = conc.iterator(); iter.hasNext();) {
                ConcordanceLine line = (ConcordanceLine) iter.next();
                for (Iterator iterator = line.getLeftHandSide().iterator(); iterator.hasNext();) {
                  Token token = (Token) iterator.next();
                  tlist.add(token);
                }
                for (Iterator iterator = line.getRightHandSide().iterator(); iterator.hasNext();) {
                  Token token = (Token) iterator.next();
                  tlist.add(token);
                }
              }
              efm = new EntryFrequencyMap(yoshikoder.getDictionary(), tlist);
              efmMap.put(doc, efm);
            }
View Full Code Here

TOP

Related Classes of edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.