Package: edu.harvard.wcfia.yoshikoder.document.tokenizer

Usage examples of edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationCache


        // TODO remove redundant code here!
       
        // Excerpt: tokenizes the first document, counts dictionary-category
        // matches, and writes a category-frequency header row into an Excel
        // (HSSF) sheet; remaining documents are handled in the loop below.
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        // NOTE(review): the freshly tokenized list is not stored back with
        // tcache.putTokenList(doc1, tl1), unlike other call sites in this
        // file, so doc1 is re-tokenized on every cache miss — confirm intent.
       
        // for _all_ categories
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        // counts[0..keys.length-1] hold per-category counts; the extra
        // trailing slot holds the document's total token count (N).
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        // Column 0 is left empty (presumably for document titles — TODO
        // confirm); category paths fill columns 1..keys.length, followed by
        // a "Total" column at keys.length+1.
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        // One data row per document: tokenize (cache-aware) and build a
        // concordance around catnode matches with context window wsize.
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          // note _all_categories counted (implicitly around catnode matches)
View Full Code Here


    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // Excerpt: writes a CSV report of dictionary-category frequencies —
        // a header row of category paths plus "Total", then one row per
        // document (row-writing continuation truncated in this excerpt).
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        // NOTE(review): cache-miss result is not put back via
        // tcache.putTokenList, unlike getVocab and
        // makeMultipleDocumentConcordance elsewhere on this page.

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        // Extra trailing slot carries the grand token total (N).
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        // CSV header: leading empty cell, then one escaped category path
        // per column, then "Total".
        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          // NOTE(review): this 4-arg getDocumentStats(title, conc, keys, root)
          // does not match the 3-arg getDocumentStats(doc, keys, catnode)
          // excerpted below — presumably a different overload from another
          // class; verify against the full sources.
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());
View Full Code Here

        super(yk, MakeConcordanceAction.class.getName());       
    }
   
    /**
     * Builds a concordance for each document around matches of node {@code n},
     * tokenizing on cache miss and storing the token list back into the cache.
     *
     * @param docs  documents to process
     * @param n     dictionary node whose matches anchor each concordance
     * @param wsize context window size passed to the dictionary — presumably
     *              tokens either side of a match; TODO confirm units
     * @return map from each document to its concordance
     * @throws TokenizationException if a document cannot be tokenized
     * @throws IOException if reading a document fails
     */
    protected Map<YKDocument,Concordance> makeMultipleDocumentConcordance(YKDocument[] docs, Node n, int wsize) throws TokenizationException, IOException {
      //Concordance con = new ConcordanceImpl(wsize);
      TokenizationCache tcache = yoshikoder.getTokenizationCache();
      TokenizationService service = TokenizationService.getTokenizationService();
      Map<YKDocument,Concordance> map = new HashMap<YKDocument,Concordance>();
      for (YKDocument doc : docs) {
        TokenList tl = tcache.getTokenList(doc);
        if (tl == null){
          // Cache miss: tokenize once and cache for subsequent passes.
          tl = service.tokenize(doc);
          tcache.putTokenList(doc, tl);
        }
        Concordance c = yoshikoder.getDictionary().getConcordance(tl, n, wsize);
        map.put(doc, c);
      }
      return map;
View Full Code Here

  }

  // Excerpt: tokenizes one document (cache-aware) and computes its
  // dictionary-entry frequency map under catnode; the per-key count
  // extraction and return are truncated in this excerpt.
  protected int[] getDocumentStats(YKDocument doc, Node[] keys, CategoryNode catnode) throws IOException, TokenizationException {
       
    // tokenize the document
    TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc);
        if (tl1 == null)
            tl1 = TokenizationService.getTokenizationService().tokenize(doc);
        // NOTE(review): cache-miss result is not stored via putTokenList.
       
        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
View Full Code Here

    tworker = new TaskWorker(yoshikoder){
            protected void doWork() throws Exception {
              // Excerpt: background task that tokenizes the first document
              // (cache-aware) before computing dictionary counts; the count
              // computation is truncated in this excerpt.
              // NOTE(review): byte-identical to the next TaskWorker fragment
              // on this page — candidate for extraction into a shared helper.
              // FIRST DOC
              YKDocument doc1 = (YKDocument)docs.get(0);
              // tokenize the document
            TokenizationCache tcache = yoshikoder.getTokenizationCache();
            TokenList tl1 = tcache.getTokenList(doc1);
                if (tl1 == null)
                  tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
                 YKDictionary dict = yoshikoder.getDictionary();
               
                 // compute the dictionary counts
View Full Code Here

    tworker = new TaskWorker(yoshikoder){
            protected void doWork() throws Exception {
              // Excerpt: background task that tokenizes the first document
              // (cache-aware) before computing dictionary counts; truncated.
              // NOTE(review): duplicates the preceding TaskWorker fragment —
              // the repeated tokenize-or-fetch pattern could live in one
              // shared helper method.
              // FIRST DOC
              YKDocument doc1 = (YKDocument)docs.get(0);
              // tokenize the document
            TokenizationCache tcache = yoshikoder.getTokenizationCache();
            TokenList tl1 = tcache.getTokenList(doc1);
                if (tl1 == null)
                  tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
                 YKDictionary dict = yoshikoder.getDictionary();
               
                 // compute the dictionary counts
View Full Code Here

  }

  // first pass to get vocab
  /**
   * First pass over the corpus: collects the union of distinct word types
   * across all documents, tokenizing on cache miss and caching the token
   * list for reuse by later passes.
   * (Construction and return of the final list are truncated in this excerpt.)
   *
   * @param docs documents whose vocabulary is collected
   * @throws TokenizationException if a document cannot be tokenized
   * @throws IOException if reading a document fails
   */
  protected List<String> getVocab(List<YKDocument> docs) throws IOException, TokenizationException {
    Set<String> vocab = new HashSet<String>();
    TokenizationCache tcache = yoshikoder.getTokenizationCache();
    for (YKDocument doc : docs) {
      TokenList tl = tcache.getTokenList(doc);
      if (tl == null){
        tl = TokenizationService.getTokenizationService().tokenize(doc);
        tcache.putTokenList(doc, tl);
      }
      // Merge this document's word types into the running vocabulary.
      WordFrequencyMap map = new WordFrequencyMap(tl);
      vocab.addAll(map.getVocabularyList());
    }
    List<String> list = new ArrayList<String>();
View Full Code Here

    // write header
    // CSV header: leading empty cell (for the title column), one escaped
    // column per vocabulary word, then a trailing "Total" column.
    for (String word : vocab)
      writer.write("," + FileUtil.escapeForCsv(word));
    writer.write(",Total\n");
   
    TokenizationCache tcache = yoshikoder.getTokenizationCache();
    for (YKDocument doc : docs) {
      TokenList tl = tcache.getTokenList(doc);
      if (tl == null){
        tl = TokenizationService.getTokenizationService().tokenize(doc);
        tcache.putTokenList(doc, tl);
      }
      // One CSV row per document: escaped title, then per-word counts
      // (the count-writing continuation is truncated in this excerpt).
      WordFrequencyMap map = new WordFrequencyMap(tl);
      writer.write( FileUtil.escapeForCsv(doc.getTitle()) );
      for (String vocabWord: vocab) {
        Integer count = map.getWordCount(vocabWord);
View Full Code Here

        // "Total" header cell sits one column past the last vocabulary word.
        cell = row.createCell((short)(vocab.size()+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");
   
        int rowNumber = 1;
    TokenizationCache tcache = yoshikoder.getTokenizationCache();
    for (YKDocument doc : docs) {
      TokenList tl = tcache.getTokenList(doc);
      if (tl == null){
        tl = TokenizationService.getTokenizationService().tokenize(doc);
        tcache.putTokenList(doc, tl);
      }
      WordFrequencyMap map = new WordFrequencyMap(tl);
     
      // NOTE(review): the (short) casts cap usable rows/columns at 32767 —
      // a limitation of the old POI createRow/createCell short overloads;
      // confirm the POI version before changing.
      row = sheet.createRow((short)rowNumber);
      cell = row.createCell((short)0);
View Full Code Here

                protected void doWork() throws Exception {
                    // Excerpt: compares two documents — tokenizes both
                    // (cache-aware, results cached) and builds an
                    // entry-frequency map for each under catnode; the actual
                    // comparison logic is truncated in this excerpt.
                    DocumentList dl = new DocumentListImpl();
                    dl.add(doc1);
                    dl.add(doc2);

                    TokenizationCache tcache = yoshikoder.getTokenizationCache();
                    TokenList tl1 = tcache.getTokenList(doc1);
                    TokenList tl2 = tcache.getTokenList(doc2);
                    if (tl1 == null){
                        tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
                        tcache.putTokenList(doc1, tl1);
                    }
                    if (tl2 == null){
                        tl2 = TokenizationService.getTokenizationService().tokenize(doc2);
                        tcache.putTokenList(doc2, tl2);
                    }
                   
                    //YKDictionary dict = yoshikoder.getDictionary();
                    // Per-document frequency maps over the same category node,
                    // presumably compared downstream — TODO confirm.
                    EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
                    EntryFrequencyMap efm2 = new EntryFrequencyMap(catnode, tl2);
View Full Code Here

TOP

Related Classes of edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationCache

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.