Package edu.harvard.wcfia.yoshikoder.document

Examples of edu.harvard.wcfia.yoshikoder.document.YKDocument
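
The excerpts on this page all follow the same basic workflow: obtain a YKDocument, tokenize it with the TokenizationService (consulting a TokenizationCache first where one is available), and then count or concord the resulting tokens against a dictionary. Below is a minimal sketch of that workflow pieced together from calls that appear in the excerpts; imports from the Yoshikoder packages are omitted, and the YKDictionary named dict is assumed to be supplied by the surrounding application (e.g. yoshikoder.getDictionary() in the export excerpts).

    // Sketch only: combines calls shown in the excerpts on this page.
    // Wrap a raw string as a document (title, text, encoding).
    YKDocument doc = YKDocumentFactory.createDummyDocument("example", "one two three two one", "UTF-8");

    // Tokenize it via the shared tokenization service.
    TokenList tokens = TokenizationService.getTokenizationService().tokenize(doc);

    // Count dictionary category matches over the token list; 'dict' is a
    // YKDictionary assumed to come from the application context.
    EntryFrequencyMap efm = new EntryFrequencyMap(dict.getDictionaryRoot(), tokens);
    System.err.println(doc.getTitle() + ": " + efm.getTokenTotal() + " tokens");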


    public String toString(){
        StringBuffer sb = new StringBuffer();
        sb.append("{");
        for (Iterator iter = map.keySet().iterator(); iter.hasNext();) {
            YKDocument doc = (YKDocument) iter.next();
            sb.append(doc.getTitle() + ": ");
            SoftReference sr = (SoftReference)map.get(doc);
            TokenList tl = (TokenList)sr.get();
            if (tl == null)
                sb.append("[null],");
            else
            // ... (remainder of excerpt omitted)
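
The excerpt above is from a token-list cache: the map values are SoftReference objects, so the garbage collector may reclaim a cached TokenList at any time, which is why sr.get() must be checked for null. Below is a hedged sketch of the matching get-or-tokenize lookup, modeled on the tcache.getTokenList(...) / tokenize(...) pattern used in the export excerpts further down; this combined helper is hypothetical, not the actual TokenizationCache API.

    // Hypothetical helper: look up a cached TokenList, re-tokenizing if the
    // entry is missing or its SoftReference has been cleared by the GC.
    public TokenList getOrTokenize(YKDocument doc) throws Exception {
        SoftReference sr = (SoftReference) map.get(doc);
        TokenList tl = (sr == null) ? null : (TokenList) sr.get();
        if (tl == null) {
            tl = TokenizationService.getTokenizationService().tokenize(doc);
            map.put(doc, new SoftReference(tl));
        }
        return tl;
    }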


      sb.append(" ");
      sb.append(cl.getRightHandSideView());
      sb.append(" ");
    }
    // tokenize and apply dictionary to the 'document'
    YKDocument doc = new YKDocumentImpl(docTitle, sb.toString());
    TokenList tl = TokenizationService.getTokenizationService().tokenize(doc);
    EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl);

    int[] counts = new int[keys.length+1];
    for (int ii=0; ii<keys.length; ii++) {
      // ... (remainder of excerpt omitted)

    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // TODO remove redundant code here!
       
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
       
        // for _all_ categories
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          // note _all_categories counted (implicitly around catnode matches)
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());

          row = sheet.createRow((short)rownum);
          cell = row.createCell((short)0);
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          cell.setCellValue(d.getTitle());

          for (int ii = 0; ii < keys.length; ii++) {
            cell = row.createCell((short)(ii+1));
            cell.setCellValue((double)counts[ii]);
          }
          // ... (remainder of excerpt omitted)

    final int wsize = winsize;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());

          writer.write(FileUtil.escapeForCsv(d.getTitle()));
          for (int ii = 0; ii < keys.length; ii++) {
            writer.write("," + counts[ii]);
          }
          writer.write("," + counts[keys.length] + "\n");
        }
        // ... (remainder of excerpt omitted)

    public ExportDocumentAsUTF16Action(Yoshikoder yk) {
        super(yk, ExportDocumentAsUTF16Action.class.getName());
    }
   
    public void actionPerformed(ActionEvent e) {       
        final YKDocument doc = yoshikoder.getSelectedDocument();
        if (doc == null) return;
       
        if (documentExporter==null)
            documentExporter =
                DialogUtil.makeFileDialog(yoshikoder,
                        "Export Document as UTF-16",
                        FileDialog.SAVE, null); // TODO loc       
       
        documentExporter.setFile(null);
        documentExporter.show();
        String fname = documentExporter.getFile();
        if (fname == null) return;
       
        File filed = new File(documentExporter.getDirectory(), fname);
        final File file = FileUtil.suffix( filed, "txt");
        tworker = new TaskWorker(yoshikoder){
            protected void doWork() throws Exception {
                FileUtil.save(file, doc.getText(), "UTF-16");
            }
            protected void onError() {
                DialogUtil.yelp(yoshikoder, "Could not export the document", e); // TODO loc
            }
        };
        // ... (remainder of excerpt omitted)

    final CategoryNode catnode = node;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        YKDictionary dict = yoshikoder.getDictionary();

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          counts = getDocumentStats(d, keys, catnode);

          row = sheet.createRow((short)rownum);
          cell = row.createCell((short)0);
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          cell.setCellValue(d.getTitle());

          for (int ii = 0; ii < keys.length; ii++) {
            cell = row.createCell((short)(ii+1));
            cell.setCellValue((double)counts[ii]);
          }
          // ... (remainder of excerpt omitted)

    final CategoryNode catnode = node;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        YKDictionary dict = yoshikoder.getDictionary();

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          counts = getDocumentStats(d, keys, catnode);

          writer.write(FileUtil.escapeForCsv(d.getTitle()));
          for (int ii = 0; ii < keys.length; ii++) {
            writer.write("," + counts[ii]);
          }
          writer.write("," + counts[keys.length] + "\n");
        }
        // ... (remainder of excerpt omitted)

 
  abstract public void reactToDoubleClickedConcordanceLine();
 
  public static void main(String[] args) throws Exception{
   
    YKDocument doc1 = YKDocumentFactory.createYKDocument(new File("/Users/will/Desktop/bundestagdebate/14235.txt"),
        "doc1", "UTF-8", Locale.GERMAN);
    YKDocument doc2 = YKDocumentFactory.createYKDocument(new File("/Users/will/Desktop/bundestagdebate/14235.txt"),
        "doc2", "UTF-8", Locale.GERMAN);
    TokenStructuredDocument tsdoc1 = new TokenStructuredDocument(doc1);
    TokenStructuredDocument tsdoc2 = new TokenStructuredDocument(doc2);
    String[][] conc1 = tsdoc1.getPatternConcordance(Pattern.compile("vier"), 5);
    String[][] conc2 = tsdoc2.getPatternConcordance(Pattern.compile("vier"), 5);
    // ... (remainder of excerpt omitted)

    System.err.println("Make character span [" + span[0] + ", " + span[1] + "] in document '" + doc.getTitle() + "' visible");
  }
 
  public static void main(String[] args) throws IOException {

    YKDocument doc1 = YKDocumentFactory.createYKDocument(new File("/Users/will/Documents/german-manifestos-econ/ORIG_ECON_CDU1990.txt"),
        "CDU91990", "ISO-8859-1", Locale.GERMAN);
    YKDocument doc2 = YKDocumentFactory.createYKDocument(new File("/Users/will/Documents/german-manifestos-econ/ORIG_ECON_CDU1994.txt"),
        "CDU91994", "ISO-8859-1", Locale.GERMAN);

    TokenStructuredDocument tsd1 = new TokenStructuredDocument(doc1);
    TokenStructuredDocument tsd2 = new TokenStructuredDocument(doc2);
    // ... (remainder of excerpt omitted)

    else
      return "";
  }

  public static void main(String[] args) throws IOException {
    YKDocument d1 = YKDocumentFactory.createDummyDocument("dummy1", "1 2 3 4 5 6 7 8 9", "UTF-8");
    YKDocument d2 = YKDocumentFactory.createDummyDocument("dummy2", "5 6 7 8 9 10 11 12", "UTF-8");
    TokenStructuredDocument tsd1 = new TokenStructuredDocument(d1);
    TokenStructuredDocument tsd2 = new TokenStructuredDocument(d2);
    Pattern[] p5 = new Pattern[]{Pattern.compile("9")};
    Pattern[] p34 = new Pattern[]{Pattern.compile("2"), Pattern.compile("3"), Pattern.compile("4")};
    List<TokenStructuredDocument> dlist = new ArrayList<TokenStructuredDocument>();
    // ... (remainder of excerpt omitted)
