Package edu.harvard.wcfia.yoshikoder.document

Examples of edu.harvard.wcfia.yoshikoder.document.YKDocument
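
The excerpts on this page all follow the same basic workflow: obtain a YKDocument, tokenize it with the TokenizationService (consulting a TokenizationCache first where one is available), and then count or concord the resulting tokens against a dictionary. Below is a minimal sketch of that workflow pieced together from calls that appear in the excerpts; imports from the Yoshikoder packages are omitted, and the YKDictionary named dict is assumed to be supplied by the surrounding application (e.g. yoshikoder.getDictionary() in the export excerpts).

    // Sketch only: combines calls shown in the excerpts on this page.
    // Wrap a raw string as a document (title, text, encoding).
    YKDocument doc = YKDocumentFactory.createDummyDocument("example", "one two three two one", "UTF-8");

    // Tokenize it via the shared tokenization service.
    TokenList tokens = TokenizationService.getTokenizationService().tokenize(doc);

    // Count dictionary category matches over the token list; 'dict' is a
    // YKDictionary assumed to come from the application context.
    EntryFrequencyMap efm = new EntryFrequencyMap(dict.getDictionaryRoot(), tokens);
    System.err.println(doc.getTitle() + ": " + efm.getTokenTotal() + " tokens");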


    public String toString(){
        StringBuffer sb = new StringBuffer();
        sb.append("{");
        for (Iterator iter = map.keySet().iterator(); iter.hasNext();) {
            YKDocument doc = (YKDocument) iter.next();
            sb.append(doc.getTitle() + ": ");
            SoftReference sr = (SoftReference)map.get(doc);
            TokenList tl = (TokenList)sr.get();
            if (tl == null)
                sb.append("[null],");
            else
            // ... (remainder of excerpt omitted)
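
The excerpt above is from a token-list cache: the map values are SoftReference objects, so the garbage collector may reclaim a cached TokenList at any time, which is why sr.get() must be checked for null. Below is a hedged sketch of the matching get-or-tokenize lookup, modeled on the tcache.getTokenList(...) / tokenize(...) pattern used in the export excerpts further down; this combined helper is hypothetical, not the actual TokenizationCache API.

    // Hypothetical helper: look up a cached TokenList, re-tokenizing if the
    // entry is missing or its SoftReference has been cleared by the GC.
    public TokenList getOrTokenize(YKDocument doc) throws Exception {
        SoftReference sr = (SoftReference) map.get(doc);
        TokenList tl = (sr == null) ? null : (TokenList) sr.get();
        if (tl == null) {
            tl = TokenizationService.getTokenizationService().tokenize(doc);
            map.put(doc, new SoftReference(tl));
        }
        return tl;
    }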


      sb.append(" ");
      sb.append(cl.getRightHandSideView());
      sb.append(" ");
    }
    // tokenize and apply dictionary to the 'document'
    YKDocument doc = new YKDocumentImpl(docTitle, sb.toString());
    TokenList tl = TokenizationService.getTokenizationService().tokenize(doc);
    EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl);

    int[] counts = new int[keys.length+1];
    for (int ii=0; ii<keys.length; ii++) {
      // ... (remainder of excerpt omitted)

    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // TODO remove redundant code here!
       
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
       
        // for _all_ categories
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          // note _all_categories counted (implicitly around catnode matches)
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());

          row = sheet.createRow((short)rownum);
          cell = row.createCell((short)0);
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          cell.setCellValue(d.getTitle());

          for (int ii = 0; ii < keys.length; ii++) {
            cell = row.createCell((short)(ii+1));
            cell.setCellValue((double)counts[ii]);
          }
          // ... (remainder of excerpt omitted)

    final int wsize = winsize;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          TokenList tl2 = tcache.getTokenList(d);
          if (tl2 == null)
            tl2 = TokenizationService.getTokenizationService().tokenize(d);
          Concordance conc = dict.getConcordance(tl2, catnode, wsize);
         
          counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());

          writer.write(FileUtil.escapeForCsv(d.getTitle()));
          for (int ii = 0; ii < keys.length; ii++) {
            writer.write("," + counts[ii]);
          }
          writer.write("," + counts[keys.length] + "\n");
        }
        // ... (remainder of excerpt omitted)

    public ExportDocumentAsUTF16Action(Yoshikoder yk) {
        super(yk, ExportDocumentAsUTF16Action.class.getName());
    }
   
    public void actionPerformed(ActionEvent e) {       
        final YKDocument doc = yoshikoder.getSelectedDocument();
        if (doc == null) return;
       
        if (documentExporter==null)
            documentExporter =
                DialogUtil.makeFileDialog(yoshikoder,
                        "Export Document as UTF-16",
                        FileDialog.SAVE, null); // TODO loc       
       
        documentExporter.setFile(null);
        documentExporter.show();
        String fname = documentExporter.getFile();
        if (fname == null) return;
       
        File filed = new File(documentExporter.getDirectory(), fname);
        final File file = FileUtil.suffix( filed, "txt");
        tworker = new TaskWorker(yoshikoder){
            protected void doWork() throws Exception {
                FileUtil.save(file, doc.getText(), "UTF-16");
            }
            protected void onError() {
                DialogUtil.yelp(yoshikoder, "Could not export the document", e); // TODO loc
            }
        };
        // ... (remainder of excerpt omitted)

    final CategoryNode catnode = node;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        YKDictionary dict = yoshikoder.getDictionary();

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFRow row;
        HSSFCell cell;

        HSSFSheet sheet = wb.createSheet("Category frequencies");

        // header
        row = sheet.createRow((short)0);
        for (int c=0; c<keys.length; c++){
          cell = row.createCell((short)(c+1));
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          String nodepath = efm1.getEntryPath(keys[c]);
          cell.setCellValue(nodepath);
        }
        cell = row.createCell((short)(keys.length+1));
        cell.setEncoding(HSSFCell.ENCODING_UTF_16);
        cell.setCellValue("Total");

        int rownum = 1;
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          counts = getDocumentStats(d, keys, catnode);

          row = sheet.createRow((short)rownum);
          cell = row.createCell((short)0);
          cell.setEncoding(HSSFCell.ENCODING_UTF_16);
          cell.setCellValue(d.getTitle());

          for (int ii = 0; ii < keys.length; ii++) {
            cell = row.createCell((short)(ii+1));
            cell.setCellValue((double)counts[ii]);
          }
          // ... (remainder of excerpt omitted)

    final CategoryNode catnode = node;
   
    tworker = new TaskWorker(yoshikoder){
      protected void doWork() throws Exception {
        // FIRST DOC
        YKDocument doc1 = (YKDocument)docs.get(0);
        // tokenize the document
        TokenizationCache tcache = yoshikoder.getTokenizationCache();
        TokenList tl1 = tcache.getTokenList(doc1);
        if (tl1 == null)
          tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
        YKDictionary dict = yoshikoder.getDictionary();

        // compute the dictionary counts
        EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
        List lkeys = efm1.getSortedCategoryEntries();
        Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
        int[] counts = new int[keys.length+1];
        for (int ii=0; ii<keys.length; ii++) {
          Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
          counts[ii] = cnt.intValue();
        }
        // add N
        counts[keys.length] = efm1.getTokenTotal();

        for (int ii = 0; ii < keys.length; ii++) {
          String nodepath = efm1.getEntryPath(keys[ii]);

          writer.write(",");
          writer.write(FileUtil.escapeForCsv(nodepath));
        }
        writer.write(",Total\n");

        // and the rest
        for (Iterator iter = docs.iterator(); iter.hasNext();) {
          YKDocument d = (YKDocument) iter.next();
          counts = getDocumentStats(d, keys, catnode);

          writer.write(FileUtil.escapeForCsv(d.getTitle()));
          for (int ii = 0; ii < keys.length; ii++) {
            writer.write("," + counts[ii]);
          }
          writer.write("," + counts[keys.length] + "\n");
        }
        // ... (remainder of excerpt omitted)

 
  abstract public void reactToDoubleClickedConcordanceLine();
 
  public static void main(String[] args) throws Exception{
   
    YKDocument doc1 = YKDocumentFactory.createYKDocument(new File("/Users/will/Desktop/bundestagdebate/14235.txt"),
        "doc1", "UTF-8", Locale.GERMAN);
    YKDocument doc2 = YKDocumentFactory.createYKDocument(new File("/Users/will/Desktop/bundestagdebate/14235.txt"),
        "doc2", "UTF-8", Locale.GERMAN);
    TokenStructuredDocument tsdoc1 = new TokenStructuredDocument(doc1);
    TokenStructuredDocument tsdoc2 = new TokenStructuredDocument(doc2);
    String[][] conc1 = tsdoc1.getPatternConcordance(Pattern.compile("vier"), 5);
    String[][] conc2 = tsdoc2.getPatternConcordance(Pattern.compile("vier"), 5);
    // ... (remainder of excerpt omitted)

    System.err.println("Make character span [" + span[0] + ", " + span[1] + "] in document '" + doc.getTitle() + "' visible");
  }
 
  public static void main(String[] args) throws IOException {

    YKDocument doc1 = YKDocumentFactory.createYKDocument(new File("/Users/will/Documents/german-manifestos-econ/ORIG_ECON_CDU1990.txt"),
        "CDU91990", "ISO-8859-1", Locale.GERMAN);
    YKDocument doc2 = YKDocumentFactory.createYKDocument(new File("/Users/will/Documents/german-manifestos-econ/ORIG_ECON_CDU1994.txt"),
        "CDU91994", "ISO-8859-1", Locale.GERMAN);

    TokenStructuredDocument tsd1 = new TokenStructuredDocument(doc1);
    TokenStructuredDocument tsd2 = new TokenStructuredDocument(doc2);
    // ... (remainder of excerpt omitted)

    else
      return "";
  }

  public static void main(String[] args) throws IOException {
    YKDocument d1 = YKDocumentFactory.createDummyDocument("dummy1", "1 2 3 4 5 6 7 8 9", "UTF-8");
    YKDocument d2 = YKDocumentFactory.createDummyDocument("dummy2", "5 6 7 8 9 10 11 12", "UTF-8");
    TokenStructuredDocument tsd1 = new TokenStructuredDocument(d1);
    TokenStructuredDocument tsd2 = new TokenStructuredDocument(d2);
    Pattern[] p5 = new Pattern[]{Pattern.compile("9")};
    Pattern[] p34 = new Pattern[]{Pattern.compile("2"), Pattern.compile("3"), Pattern.compile("4")};
    List<TokenStructuredDocument> dlist = new ArrayList<TokenStructuredDocument>();
    // ... (remainder of excerpt omitted)
