Source Code of uk.ac.cam.ha293.tweetlabel.twitter.SimpleProfile

package uk.ac.cam.ha293.tweetlabel.twitter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import uk.ac.cam.ha293.tweetlabel.types.Document;
import uk.ac.cam.ha293.tweetlabel.classify.AlchemyClassification;
import uk.ac.cam.ha293.tweetlabel.classify.AlchemyClassifier;
import uk.ac.cam.ha293.tweetlabel.classify.CalaisClassification;
import uk.ac.cam.ha293.tweetlabel.classify.CalaisClassifier;
import uk.ac.cam.ha293.tweetlabel.classify.FullAlchemyClassification;
import uk.ac.cam.ha293.tweetlabel.classify.FullCalaisClassification;
import uk.ac.cam.ha293.tweetlabel.classify.FullTextwiseClassification;
import uk.ac.cam.ha293.tweetlabel.classify.TextwiseClassification;
import uk.ac.cam.ha293.tweetlabel.classify.TextwiseClassifier;
import uk.ac.cam.ha293.tweetlabel.liwc.FullLIWCClassification;
import uk.ac.cam.ha293.tweetlabel.liwc.LIWCDictionary;
import uk.ac.cam.ha293.tweetlabel.types.Category;
import uk.ac.cam.ha293.tweetlabel.types.CategoryScore;
import uk.ac.cam.ha293.tweetlabel.types.Corpus;
import uk.ac.cam.ha293.tweetlabel.util.Tools;

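//A serialisable snapshot of one Twitter user's tweets, with helpers to
//persist it (Java serialisation and CSV) and to classify the profile
//via the Alchemy, OpenCalais and Textwise services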
public class SimpleProfile implements Serializable {

  private static final long serialVersionUID = 4610549868654867470L;
 
  //private static LIWCDictionary liwc = LIWCDictionary.loadDictionaryFromFile("liwc/dictionary.obj");
  private long userID;
  private List<SimpleTweet> tweets;
 
  public SimpleProfile(long userID) {
    this.userID = userID;
    tweets = new ArrayList<SimpleTweet>();
  }
 
  public long getUserID() {
    return userID;
  }
 
  public void addTweet(SimpleTweet tweet) {
    tweets.add(tweet);
  }
 
  public void addTweets(List<SimpleTweet> tweetList) {
    tweets.addAll(tweetList);
  }
 
  public List<SimpleTweet> getTweets() {
    return tweets;
  }
 
  public void print() {
    System.out.println("Twitter Profile for "+userID);
    for(SimpleTweet tweet : tweets) {
      tweet.print();
    }
  }
 
  public void printUrls() {
    List<String> urls = new ArrayList<String>();
    for(SimpleTweet tweet : tweets) {
      urls.addAll(tweet.getUrls());
    }
    System.out.println("Urls Used: ");
    for(String url : urls) {
      System.out.println(url);
    }
  }
 
  public void printHashtags() {
    List<String> hashtags = new ArrayList<String>();
    for(SimpleTweet tweet : tweets) {
      hashtags.addAll(tweet.getHashtags());     
    }
    System.out.println("Hashtags Used: ");
    for(String hashtag : hashtags) {
      System.out.println(hashtag);
    }
  }
 
  public void printStripped() {
    System.out.println("Twitter Profile for "+userID);
    for(SimpleTweet tweet : tweets) {
      tweet.printStripped();
    }
  }
 
  public void save() {
    try {
      String profileFilename = "profiles/simple/"+userID+".sprofile";
      FileOutputStream fileOut = new FileOutputStream(profileFilename);
      ObjectOutputStream objectOut = new ObjectOutputStream(fileOut);
      objectOut.writeObject(this);
      objectOut.close();
      System.out.println("Saved simple profile for "+userID);
    } catch (FileNotFoundException e) {
      System.out.println("Couldn't save simple profile for "+userID);
      e.printStackTrace();
    } catch (IOException e) {
      System.out.println("Couldn't save simple profile for "+userID);
      e.printStackTrace();     
    }
  }
 
  public void saveCSV() {
    try {
      String profileFilename = "profiles/csv/"+userID+".csv";
      FileOutputStream fileOut = new FileOutputStream(profileFilename);
      PrintWriter writeOut = new PrintWriter(fileOut);
      writeOut.println("\"userid\",\"tweetid\",\"text\"");
      for(SimpleTweet tweet : tweets) {
        writeOut.print(tweet.getUserID()+",");
        writeOut.print(tweet.getTweetID()+",");
        writeOut.print("\""+tweet.getText()+"\"");
        writeOut.println();
      }
      writeOut.close();
      System.out.println("Saved CSV profile for "+userID);
    } catch (IOException e) {
      System.out.println("Couldn't save CSV profile for "+userID);
      e.printStackTrace();
    }
  }
 
  /*
  //NOTE: We perform stripping while converting a profile to a Document Set
  //NOTE: NEED TO MEMOISE CLASSIFICATIONS
  public Set<Document> asDocumentSet(String topicType) {
    System.out.println("Converting profile to Document set...");
    Set<Document> documents = new HashSet<Document>();
    Map<Long,List<CategoryScore>> scores = null; //for LIWC
    int count = 0;
    for(SimpleTweet tweet : tweets) {
      if(count % 50 == 0) System.out.println(count+"/"+tweets.size()+" tweets converted...");
      count++;
      String strippedText = Tools.stripTweet(tweet.getText());
      //String strippedText = Tools.stripTweetVerbose(tweet.getText());
      Set<String> topics = new HashSet<String>();
      Document document = null;
      if(topicType != null) {
        //NOTE: We classify on the TWEET, not on the stemmed/stripped version!
        if(topicType.equals("liwc")) {
          if(scores == null) scores = liwc.getLIWCCategoryScores(userID);
          //List<CategoryScore> liwcClassifications = liwc.classifyTweet(tweet); //NOTE: this is already sorted by LIWCDictionary
          List<CategoryScore> liwcClassifications = scores.get(tweet.getTweetID());
          //Take the top 3... note that this is of limited use, I just need some topics to get started
          int getTop = 3;
          if(getTop > liwcClassifications.size()) getTop = liwcClassifications.size();
          for(int i=0; i<getTop; i++) {
            topics.add(liwcClassifications.get(i).getCategory().getTitle());
          }
          document = new Document(strippedText,tweet.getTweetID(),topics);   
        }
       
        else if(topicType.equals("liwcnb")) {
          if(scores == null) scores = liwc.getLIWCCategoryScoresNB(userID);
          //List<CategoryScore> liwcClassifications = liwc.classifyTweetNaiveBayes(tweet); //NOTE: this is already sorted by LIWCDictionary
          List<CategoryScore> liwcClassifications = scores.get(tweet.getTweetID());
          //Take the top 3... note that this is of limited use, I just need some topics to get started
          int getTop = 3;
          if(getTop > liwcClassifications.size()) getTop = liwcClassifications.size();
          for(int i=0; i<getTop; i++) {
            topics.add(liwcClassifications.get(i).getCategory().getTitle());
          }
          document = new Document(strippedText,tweet.getTweetID(),topics); 
        }
      } else {
        document = new Document(strippedText,tweet.getTweetID());
      }
      documents.add(document);
    }
    return documents;
  }
 
  public Set<Document> asDocumentSet() {
    return asDocumentSet(null);
  }
 
 
  public Corpus asCorpus(String topicType) {
    Corpus corpus = new Corpus(this.asDocumentSet(topicType),topicType);
    return corpus;
  }
 
  public Corpus asCorpus() {
    Corpus corpus = new Corpus(this.asDocumentSet(null));
    return corpus;
  }
  */

  /* NOTE Details of weighted Alchemy classification here:
   * Rather than counting instances of each different classification
   * by adding 1 every time a category appears, the score of that
   * category (between 0.0 and 1.0) is added to the category's count -
   * this way a weighted count can be achieved. Two separate counts
   * are kept - a text count (based on tweet content) and a URL count
   * (based on the classifications of all the URLs in a profile).
   * String categories therefore form the keys of the count hashmaps.
   *
   * Also NOTE: this method can take AGES
   *
   * Also NOTE: this method ONLY SAVES A CATEGORY into the csv if it
   * appears in the profile or its URLs; it doesn't have a
   * comprehensive list of all Alchemy categories.
   */
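  /* Worked example of the weighted count (illustrative numbers only):
   * if "sports" is returned for three tweets with scores 0.9, 0.6 and
   * 0.3, then textCategoryScores maps "sports" to 1.8 while
   * textCategoryCounts maps it to 3; a plain frequency count would
   * record 3 in both.
   */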
  public boolean classifyAlchemy() {
   
    System.out.println("Alchemy classifying profile "+userID);
   
    //Check if classification already exists, to save time and allow resuming classification if something goes wrong
    String classificationFilename = "classifications/alchemy/"+userID+".csv";
    File classificationFile = new File(classificationFilename);
    if(classificationFile.exists()) {
      System.out.println("Alchemy classification already exists for "+userID+", aborting");
      return true; //or do we want to return false...
    }
   
    FileOutputStream fileOutTweets = null;
    PrintWriter writeOutTweets = null;
    try {
      String tweetsFilename = "classifications/alchemy/tweets/"+userID+".csv";
      fileOutTweets = new FileOutputStream(tweetsFilename);
      writeOutTweets = new PrintWriter(fileOutTweets);
      writeOutTweets.println("\"tweetID\",\"category\",\"score\",\"urlcategories\",\"urlscores\"");
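      //Each row is tweetID, category, score, then one extra
      //(category,score) pair per URL in the tweet, so rows can be
      //wider than the five-column header suggests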
    } catch(IOException e) {
      System.err.println("Couldn't open tweets file to save to");
      return false; //bail out here, otherwise writeOutTweets is null below
    }
   
    Map<String,Double> textCategoryScores = new HashMap<String,Double>();
    Map<String,Integer> textCategoryCounts = new HashMap<String,Integer>();
    Map<String,Double> urlCategoryScores = new HashMap<String,Double>();
    Map<String,Integer> urlCategoryCounts = new HashMap<String,Integer>();
   
    for(SimpleTweet tweet : tweets) {
      //Handle tweet content classification
      AlchemyClassification textClassification = AlchemyClassifier.classifyText(tweet.getText());
      if(textClassification == null) {
        System.err.println("Daily Transaction Limit has been reached (or a null classification was given weirdly)");
        return false;
      }
     
      writeOutTweets.print(tweet.getTweetID()+","+textClassification.getCategory()+","+textClassification.getScore());
      System.err.println("JUST WROTE TWEEET OUT");
     
      if(textCategoryScores.containsKey(textClassification.getCategory())) {
        //Add the new value, since the key already exists
        textCategoryScores.put(textClassification.getCategory(), textCategoryScores.get(textClassification.getCategory()) + textClassification.getScore());
        textCategoryCounts.put(textClassification.getCategory(), textCategoryCounts.get(textClassification.getCategory()) + 1);
      } else {
        //Insert the category into the map
        textCategoryScores.put(textClassification.getCategory(), textClassification.getScore());
        textCategoryCounts.put(textClassification.getCategory(), 1);
      }
     
      //Handle URL classification
      for(String url : tweet.getUrls()) {
        AlchemyClassification urlClassification = AlchemyClassifier.classifyURL(url);
        if(urlClassification == null) {
          System.err.println("Daily Transaction Limit has been reached (or a null classification was given weirdly)");
          return false;
        }
       
        writeOutTweets.print(","+urlClassification.getCategory()+","+urlClassification.getScore());
       
        if(urlCategoryScores.containsKey(urlClassification.getCategory())) {
          //Add the new value, since the key already exists
          urlCategoryScores.put(urlClassification.getCategory(), urlCategoryScores.get(urlClassification.getCategory()) + urlClassification.getScore());
          urlCategoryCounts.put(urlClassification.getCategory(), urlCategoryCounts.get(urlClassification.getCategory()) + 1);
        } else {
          //Insert the category into the map
          urlCategoryScores.put(urlClassification.getCategory(), urlClassification.getScore());
          urlCategoryCounts.put(urlClassification.getCategory(), 1);
        }
      }

      writeOutTweets.println(); //end this tweet's CSV row
    }
   
    writeOutTweets.close();
   
    //Now save as a CSV Alchemy classification file containing the scores
    try {
      FileOutputStream fileOut = new FileOutputStream(classificationFilename);
      PrintWriter writeOut = new PrintWriter(fileOut);
      writeOut.println("\"category\",\"textscore\",\"textcount\",\"urlscore\",\"urlcount\"");
     
      //Need to merge the category sets from both text and URL classifications
      //in case one has a category the other does not have
 
      Set<String> categorySet = new HashSet<String>(textCategoryScores.keySet());
      categorySet.addAll(urlCategoryScores.keySet());
     
      //remove the useless categories
      categorySet.remove(null); //you can have a null key? weird
      categorySet.remove("unknown");
     
      for(String category : categorySet) {       
        Double textscore = textCategoryScores.get(category);
        Integer textcount = textCategoryCounts.get(category);
        Double urlscore = urlCategoryScores.get(category);
        Integer urlcount = urlCategoryCounts.get(category);
       
        //If no score exists for a category, it has not appeared in a profile
        if(textscore == null) {
          textscore = 0.0;
          textcount = 0;
        }
        if(urlscore == null) {
          urlscore = 0.0;
          urlcount = 0;
        }
       
        writeOut.print("\""+category+"\",");
        writeOut.print(textscore+",");
        writeOut.print(textcount+",");
        writeOut.print(urlscore+",");
        writeOut.print(urlcount);
        writeOut.println();
      }
      writeOut.close();
      System.out.println("Saved CSV Alchemy classification for "+userID);
      return true;
    } catch (IOException e) {
      System.out.println("Couldn't save CSV Alchemy classification for "+userID);
      e.printStackTrace();
      return false;
    }
  }
 
  /*
  //NOTE: Incorporates both naive frequency counting and naive bayes, stores both
  public boolean classifyLIWC() {
    System.out.println("LIWC classifying profile "+userID);
   
    //Check if classification already exists, to save time and allow resuming classification if something goes wrong
    String classificationFilename = "classifications/liwc/"+userID+".csv";
    String classificationFilenameCount = "classifications/liwc/count/"+userID+".csv";
    String classificationFilenameNB = "classifications/liwc/nb/"+userID+".csv";
    File classificationFileCount = new File(classificationFilenameCount);
    File classificationFileNB = new File(classificationFilenameNB);
    if(classificationFileCount.exists() || classificationFileNB.exists()) {
      System.out.println("LIWC classification already exists for "+userID+", aborting");
      return true; //or do we want to return false...
    }
             
    //Now save as a CSV LIWC classification file containing the scores
    try {
      FileOutputStream fileOut = new FileOutputStream(classificationFilename);
      FileOutputStream fileOutCount = new FileOutputStream(classificationFilenameCount);
      FileOutputStream fileOutNB = new FileOutputStream(classificationFilenameNB);
      PrintWriter writeOut= new PrintWriter(fileOut);
      PrintWriter writeOutCount = new PrintWriter(fileOutCount);
      PrintWriter writeOutNB = new PrintWriter(fileOutNB);
      writeOut.println("\"category\",\"countscore\",\"nbscore\"");
      writeOutCount.println("\"tweetid\",\"category\",\"countscore\"");
      writeOutNB.println("\"tweetid\",\"category\",\"nbscore\"");
     
      //To speed up full-profile classification
      Map<Category,Double> scoresNormal = new HashMap<Category,Double>();
      Map<Category,Double> scoresNaiveBayes = new HashMap<Category,Double>();
   
      //Individual tweet classifications
      int averageCount = 0;
      for(SimpleTweet tweet : tweets) {
        List<CategoryScore> scoresCount = liwc.classifyTweet(tweet);
        List<CategoryScore> scoresNB = liwc.classifyTweetNaiveBayes(tweet);
        Collections.sort(scoresCount);
        Collections.sort(scoresNB);
        Collections.reverse(scoresCount);
        Collections.reverse(scoresNB);
       
        writeOutCount.print(tweet.getTweetID()+",");
        String outString = "";
        for(CategoryScore score : scoresCount) {
          //update counts for full-profile
          if(scoresNormal.containsKey(score.getCategory())) {
            scoresNormal.put(score.getCategory(),scoresNormal.get(score.getCategory())+score.getScore());
          } else {
            scoresNormal.put(score.getCategory(),score.getScore());
          }
         
          outString += (score.getCategory().getTitle()+","+score.getScore()+","); 
        }
        if(!outString.isEmpty()) outString = outString.substring(0, outString.length()-1);
        writeOutCount.print(outString);
        writeOutCount.println();
       
        writeOutNB.print(tweet.getTweetID()+",");
        outString = "";
        for(CategoryScore score : scoresNB) {
          //update counts for full-profile
          if(scoresNaiveBayes.containsKey(score.getCategory())) {
            //Moving average - hacky...
            scoresNaiveBayes.put(score.getCategory(),((scoresNaiveBayes.get(score.getCategory())*averageCount)+score.getScore()) / (averageCount+1));
          } else {
            scoresNaiveBayes.put(score.getCategory(),score.getScore());
          }
         
          outString += (score.getCategory().getTitle()+","+score.getScore()+","); 
        }
        if(!outString.isEmpty()) outString = outString.substring(0, outString.length()-1);
        writeOutNB.print(outString);
        writeOutNB.println();
        averageCount++;
      }
      writeOutCount.close();
      writeOutNB.close();
     
      Set<Category> fullCatSet = new HashSet<Category>();
      fullCatSet.addAll(scoresNormal.keySet());
      fullCatSet.addAll(scoresNaiveBayes.keySet());
      for(Category cat : fullCatSet) {
        Double normalScore = scoresNormal.get(cat);
        if(normalScore == null) normalScore = 0.0;
        writeOut.println(cat.getTitle()+","+normalScore+","+scoresNaiveBayes.get(cat));
      }
      writeOut.close();
     
      System.out.println("Saved CSV LIWC classification for "+userID);
      return true;
    } catch (IOException e) {
      System.out.println("Couldn't save CSV LIWC classification for "+userID);
      e.printStackTrace();
      return false;
    }
  }
  */
 
  //Performs the same weighted sum as the Alchemy classification does
  //ie sums the scores for all categories over all tweets.
  public boolean classifyCalais() {
    System.out.println("Calais classifying profile "+userID);
   
    //Check if classification already exists, to save time and allow resuming classification if something goes wrong
    String classificationFilename = "classifications/calais/"+userID+".csv";
    File classificationFile = new File(classificationFilename);
    if(classificationFile.exists()) {
      System.out.println("Calais classification already exists for "+userID+", aborting");
      return true; //or do we want to return false...
    }
   
    Map<String,Double> resultsMap = new HashMap<String,Double>();
    Map<String,Integer> countsMap = new HashMap<String,Integer>();
   
    for(SimpleTweet tweet : tweets) {
      CalaisClassification classification = CalaisClassifier.classifyText(tweet.getText());
      if(classification == null) {
        //Normally this is because of a <100 char tweet
        continue;
      }
      for(String category : classification.getCategories()) {
        double score = classification.lookupScore(category);     
        if(resultsMap.containsKey(category)) {
          resultsMap.put(category, resultsMap.get(category) + score);
          countsMap.put(category, countsMap.get(category) + 1);
        } else {
          resultsMap.put(category, score);
          countsMap.put(category, 1);
        }
      }
    }
             
    //Now save as a CSV Calais classification file containing the scores
    try {
      FileOutputStream fileOut = new FileOutputStream(classificationFilename);
      PrintWriter writeOut = new PrintWriter(fileOut);
      writeOut.println("\"category\",\"score\",\"countscore\"");
   
      for(String category : resultsMap.keySet()) {
        writeOut.print("\""+category+"\",");
        writeOut.print(resultsMap.get(category)+",");
        writeOut.print(countsMap.get(category));
        writeOut.println();
      }
      writeOut.close();
      System.out.println("Saved CSV Calais classification for "+userID);
      return true;
    } catch (IOException e) {
      System.out.println("Couldn't save CSV Calais classification for "+userID);
      e.printStackTrace();
      return false;
    }
  }
 
  public boolean classifyTextwise() {
   
    System.out.println("Textwise classifying profile "+userID);
   
    //Check if classification already exists, to save time and allow resuming classification if something goes wrong
    String classificationFilename = "classifications/textwise/"+userID+".csv";
    File classificationFile = new File(classificationFilename);
    if(classificationFile.exists()) {
      System.out.println("Textwise classification already exists for "+userID+", aborting");
      return true; //or do we want to return false...
    }
   
    Map<String,Double> textCategoryScores = new HashMap<String,Double>();
    Map<String,Integer> textCategoryCounts = new HashMap<String,Integer>();
    Map<String,Double> urlCategoryScores = new HashMap<String,Double>();
    Map<String,Integer> urlCategoryCounts = new HashMap<String,Integer>();
   
    for(SimpleTweet tweet : tweets) {
      //System.err.println(tweet.getText());
     
      TextwiseClassification textClassification = TextwiseClassifier.classify(tweet.getText(), false);
     
      textClassification.print();
     
      for(String category : textClassification.getCategories()) {
        if(textCategoryScores.containsKey(category)) {
          textCategoryScores.put(category, textCategoryScores.get(category) + textClassification.lookupScore(category));
          textCategoryCounts.put(category, textCategoryCounts.get(category) + 1);
        } else {
          textCategoryScores.put(category, textClassification.lookupScore(category));
          textCategoryCounts.put(category, 1);
        }
      }
     
      for(String url : tweet.getUrls()) {
       
        //System.err.println(url);
       
        TextwiseClassification urlClassification = TextwiseClassifier.classify(url, true);
       
        urlClassification.print();
       
        for(String category : urlClassification.getCategories()) {
          if(urlCategoryScores.containsKey(category)) {
            urlCategoryScores.put(category, urlCategoryScores.get(category) + urlClassification.lookupScore(category));
            urlCategoryCounts.put(category, urlCategoryCounts.get(category) + 1);
          } else {
            urlCategoryScores.put(category, urlClassification.lookupScore(category));
            urlCategoryCounts.put(category, 1);
          }
        }
      }
     
    }
   
    //Now save as a CSV Textwise classification file containing the scores
    try {
      FileOutputStream fileOut = new FileOutputStream(classificationFilename);
      PrintWriter writeOut = new PrintWriter(fileOut);
      writeOut.println("\"category\",\"textscore\",\"textcount\",\"urlscore\",\"urlcount\"");
     
      //Need to merge the category sets from both text and URL classifications
      //in case one has a category the other does not have
 
      Set<String> categorySet = new HashSet<String>(textCategoryScores.keySet());
      categorySet.addAll(urlCategoryScores.keySet());
     
      //remove the useless categories
      //categorySet.remove(null); //you can have a null key? weird
     
      for(String category : categorySet) {       
        Double textscore = textCategoryScores.get(category);
        Integer textcount = textCategoryCounts.get(category);
        Double urlscore = urlCategoryScores.get(category);
        Integer urlcount = urlCategoryCounts.get(category);
       
        //If no score exists for a category, it has not appeared in a profile
        if(textscore == null) {
          textscore = 0.0;
          textcount = 0;
        }
        if(urlscore == null) {
          urlscore = 0.0;
          urlcount = 0;
        }
       
        writeOut.print("\""+category+"\",");
        writeOut.print(textscore+",");
        writeOut.print(textcount+",");
        writeOut.print(urlscore+",");
        writeOut.print(urlcount);
        writeOut.println();
      }
      writeOut.close();
      System.out.println("Saved CSV Textwise classification for "+userID);
      return true;
    } catch (IOException e) {
      System.out.println("Couldn't save CSV Textwise classification for "+userID);
      e.printStackTrace();
      return false;
    }
   
  }
 
  public boolean classifyTextwiseProper() {
   
    System.out.println("Textwise classifying profile "+userID);
   
    //Check if classification already exists, to save time and allow resuming classification if something goes wrong
    String classificationFilename = "classifications/textwiseproper/"+userID+".csv";
    File classificationFile = new File(classificationFilename);
    if(classificationFile.exists()) {
      System.out.println("Textwise classification already exists for "+userID+", aborting");
      return true; //or do we want to return false...
    }
   
    //Concatenate tweets
    String concat = "";
    for(SimpleTweet tweet : tweets) {
      concat += tweet.getText();
    }
   
    Map<String,Double> classifications = new HashMap<String,Double>();
    Map<String,Integer> classificationsCount = new HashMap<String,Integer>();
    for(int i=0; i<concat.length()-500; i+=500) {
      TextwiseClassification textClassification;
      if(i+500>=concat.length()) textClassification = TextwiseClassifier.classify(concat.substring(i,concat.length()), false);
      else textClassification = TextwiseClassifier.classify(concat.substring(i,i+500), false)
      Map<String,Double> scores = textClassification.getCategoryScores();
      for(String cat : textClassification.getCategories()) {
        if(classifications.containsKey(cat)) {
          classifications.put(cat,classifications.get(cat)+scores.get(cat));
          classificationsCount.put(cat,classificationsCount.get(cat)+1);
        } else {
          classifications.put(cat,scores.get(cat));
          classificationsCount.put(cat,1);
        }
      }
    }
   
    //Now save as a CSV Textwise classification file containing the scores
    try {
      FileOutputStream fileOut = new FileOutputStream(classificationFilename);
      PrintWriter writeOut = new PrintWriter(fileOut);
      writeOut.println("\"category\",\"score\",\"count\"");
     
      //remove the useless categories
      //categorySet.remove(null); //you can have a null key? weird
      for(String category : classifications.keySet()) {       
        writeOut.println("\""+category+"\","+classifications.get(category)+","+classificationsCount.get(category));
      }
      writeOut.close();
      System.out.println("Saved CSV Textwise classification for "+userID);
      return true;
    } catch (IOException e) {
      System.out.println("Couldn't save CSV Textwise classification for "+userID);
      e.printStackTrace();
      return false;
    }
   
  }
 
  //Concatenate all the tweets and topics
  //NOTE: This basically is only for LIWC - can be used later if we want individual tweet stuff
  public Document asDocument(String topicType) {
    if(topicType != null) {
      if(topicType.equals("alchemy")) { //TODO: Hacky...
        return asLLDADocument("alchemy");
      }
      if(topicType.equals("calais")) {
        return asLLDADocument("calais");
      }
      if(topicType.equals("textwise")) {
        return asLLDADocument("textwise");
      }
      if(topicType.equals("textwiseproper")) {
        return asLLDADocument("textwiseproper");
      }
      if(topicType.equals("liwc")) {
        return asLLDADocument("liwc");
      }
      if(topicType.equals("liwcnb")) {
        return asLLDADocument("liwcnb");
      }
    }
   
   
    //Concatenate all tweets
    String concat = "";
    for(SimpleTweet tweet : tweets) {
      concat += tweet.getText()+" ";
    }
    String strippedText = Tools.stripTweet(concat);
    Document document = new Document(strippedText, userID);
    return document;
  }
 
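  //Randomly discards reduction*10% of this profile's tweets:
  //e.g. reduceBy(3) shuffles the list and keeps ceil(70%) of it;
  //values above 9 are a no-op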
  public void reduceBy(int reduction) {
    if(reduction > 9) return;
    int newSize = (int)Math.ceil(tweets.size() * (1.0 - reduction/10.0));
    List<SimpleTweet> newTweets = new ArrayList<SimpleTweet>();
    Collections.shuffle(tweets);
    for(int i=0; i<newSize; i++) {
      newTweets.add(tweets.get(i));
    }
    tweets = newTweets;
  }
 
  //For use in Corpus' getFullProfileCorpus(topicType) method
  public Document asLLDADocument(String topicType) {
    Set<String> topics = new HashSet<String>();
    //Note: each topic type pulls its top categories from the corresponding Full*Classification
    if(topicType.equals("alchemy")) {
      FullAlchemyClassification fac = new FullAlchemyClassification(userID);
      int topTopics = 3;
      //alchemy too sparse to threshold
      int count = 0;
      for(String topic : fac.getCategorySet()) {
        if(count == topTopics) break; //stop getting more than 3 topics
        //if(fac.getScore(topic) < scoreThreshold) break; //stop getting low-prob topics
        topics.add(topic);
        count++;
      }
    } else if(topicType.equals("calais")) {
      FullCalaisClassification fcc = new FullCalaisClassification(userID);
      int topTopics = 3;
      int count = 0;
      for(String topic : fcc.getCategorySet()) {
        if(topic.equals("Other")) continue; //really prominent...
        if(count == topTopics) break; //stop getting more than 3 topics
        //if(fac.getScore(topic) < scoreThreshold) break; //stop getting low-prob topics
        topics.add(topic);
        count++;
      }
    } else if(topicType.equals("textwise")) {
      FullTextwiseClassification ftc = new FullTextwiseClassification(userID,false);
      int topTopics = 3;
      int count = 0;
      for(String topic : ftc.getCategorySet()) {
        if(count == topTopics) break; //stop getting more than 3 topics
        //if(fac.getScore(topic) < scoreThreshold) break; //stop getting low-prob topics
        topics.add(topic);
        count++;
      }
    } else if(topicType.equals("textwiseproper")) {
      FullTextwiseClassification ftc = new FullTextwiseClassification(userID,true);
      int topTopics = 3;
      int count = 0;
      for(String topic : ftc.getCategorySet()) {
        if(count == topTopics) break; //stop getting more than 3 topics
        //if(fac.getScore(topic) < scoreThreshold) break; //stop getting low-prob topics
        topics.add(topic);
        count++;
      }
    }else if(topicType.equals("liwc")) {
      FullLIWCClassification flc = new FullLIWCClassification(false,userID);
      int topTopics = 3;
      int count = 0;
      for(String topic : flc.getCategorySet()) {
        if(count == topTopics) break;
        topics.add(topic);
        count++;
      }
    } else if(topicType.equals("liwcnb")) {
      FullLIWCClassification flc = new FullLIWCClassification(true,userID);
      int topTopics = 3;
      double threshold = 0.2;
      int count = 0;
      for(String topic : flc.getCategorySet()) {
        if(count == topTopics) break;
        if(flc.getScore(topic) < threshold) break;
        topics.add(topic);
        count++;
      }
    } else {
      System.out.println("invalid topic type");
      return null;
    }
   
    //Concatenate all tweets
    String concat = "";
    for(SimpleTweet tweet : tweets) {
      concat += tweet.getText()+" ";
    }
    String strippedText = Tools.stripTweet(concat);
    Document document = new Document(strippedText, userID, topics);
    return document;
  }
 
  public Document asDocument() {
    return asDocument(null);
  }
}
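
For orientation, a minimal sketch of driving this class. The SimpleTweet constructor is not part of this listing, so tweet construction is left as a comment, and the deserialisation step is an assumption: save() has no load() counterpart in SimpleProfile, so the sketch simply mirrors its ObjectOutputStream usage. It also assumes the profiles/simple and profiles/csv directories already exist.

import java.io.FileInputStream;
import java.io.ObjectInputStream;

import uk.ac.cam.ha293.tweetlabel.twitter.SimpleProfile;

public class SimpleProfileDemo {
  public static void main(String[] args) throws Exception {
    SimpleProfile profile = new SimpleProfile(12345L);
    //profile.addTweet(new SimpleTweet(...)); //constructor lives in SimpleTweet, not shown here

    profile.save();    //serialises to profiles/simple/12345.sprofile
    profile.saveCSV(); //writes profiles/csv/12345.csv

    //Read the serialised profile back (assumed counterpart to save())
    ObjectInputStream in = new ObjectInputStream(
        new FileInputStream("profiles/simple/12345.sprofile"));
    SimpleProfile loaded = (SimpleProfile) in.readObject();
    in.close();

    loaded.printStripped(); //dump the stripped tweets for inspection
  }
}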