// Pick the stemmed or unstemmed corpus according to the stem flag.
Corpus corpus = stem ? Corpus.load("allprofiles-stemmed") : Corpus.load("allprofiles-unstemmed");
//Check for model existence
LDATopicModel lda;
File modelFile = new File("models/lda/"+description+".model");
if (modelFile.exists()) {
    System.out.println("Found LDA model "+description);
    lda = LDATopicModel.load(description);
} else {
    System.out.println("Couldn't find LDA model "+description+", creating new one");
    // Train a fresh model (beta fixed at 0.01) and persist it for reuse.
    lda = new LDATopicModel(corpus,numTopics,burn,sample,0,alpha,0.01);
    lda.runGibbsSampling();
    lda.save(description);
}
try {
//Get the document topic distributions and store these
// (one List<WordScore> per document, indexed in the model's document order).
List<List<WordScore>> docTopics = lda.getDocuments();
// Write one CSV per document ("<userID>.csv") holding its topic distribution,
// with header "topic","probability" and one topic/score pair per row.
int docID = 0;
for (List<WordScore> document : docTopics) {
    Long userID = lda.getDocIDFromIndex(docID);
    // try-with-resources guarantees the file is closed even if a write throws;
    // the original leaked the FileOutputStream on any exception path.
    try (PrintWriter writeOut = new PrintWriter(new FileOutputStream(dirName+"/"+userID+".csv"))) {
        writeOut.println("\"topic\",\"probability\"");
        for (WordScore topic : document) {
            writeOut.println(topic.getWord()+","+topic.getScore());
        }
    }
    docID++;
}
//NOTE: We are saving these for now. However, we always have a saved model
//and we can get these attributes from the model
//should also save the topic-word distributions
//okay, so we should definitely serialize topics and vocab
Map<String,Integer> vocab = lda.getVocab();
double[][] topics = lda.getTopicsUnsorted();
//Save topics
// Java-native serialization of the raw topic-word matrix to TOPICS.obj.
// NOTE(review): the ObjectOutputStream is not closed within this view —
// confirm it is closed (ideally via try-with-resources) later in the method,
// otherwise the file may be truncated/unflushed.
FileOutputStream topicsFileOut = new FileOutputStream(dirName+"/TOPICS.obj");
ObjectOutputStream topicsObjectOut = new ObjectOutputStream(topicsFileOut);
topicsObjectOut.writeObject(topics);