## Rich Nielsen
## this version: 31 oct 2016, first version: 26 feb 2014
## A text analysis illustration.
## Preperation time: 10 hours (5 wrangling text and 5 on everything else)

## clear the workspace, just in case
rm(list=ls())

## Set the working dir (CHANGE TO YOUR OWN WORKING DIRECTORY HERE)
mydir <- "C:/Users/Richard Nielsen/Desktop/Professional Stuff/talks/text analysis talk/"
setwd(mydir)

## These are some libraries we will use later
library(tm)  
library(topicmodels)
library(lda)
library(wordcloud)

## check package versions for reproducibility later
packageVersion("tm") # 0.6.2
packageVersion("topicmodels") # 0.2.4
packageVersion("lda") # 1.4.2
packageVersion("wordcloud")  # 2.5

## This is a complete example -- I start by scraping the IQSS blog and then
## do both unsupervised and supervised text analysis on the blog posts.
## UPDATE 4/20/2015. THE IQSS BLOG HAS BEEN MOVED:
## http://projects.iq.harvard.edu/sss_blog/
## Unfortunately, this means that the webscraping example code is now broken.
##  I've commented it out below.


###############################################################
## Getting the text
###############################################################

## This was a failed scraper -- it didn't work because some months are missing!!
## That meant it collected multiples of the same blog post
## I didn't find it until the text analysis stage, so I'm leaving it in as a lesson.
## collect the html from the harvard SSS blog
##
##baseurl <- "http://www.iq.harvard.edu/blog/sss/archives/"
##urlparts1 <- as.character(2005:2013)
##urlparts2 <- c("01","02","03","04","05","06","07","08","09","10","11","12")
##
##for(i in 1:length(urlparts1)){
##  for(j in 1:length(urlparts2)){
##    ## skip the months before the blog existed
##    if(i==1 & j < 9){next}
##    ## skip the months after the archives end
##    if(urlparts1[i]=="2013" & j > 4){next}
##    currenturl <- paste(baseurl,urlparts1[i],"/",urlparts2[j],sep="")
##    try(dat <- readLines(currenturl))
##    currentpath <- paste("sss_html/",urlparts1[i],"_",urlparts2[j],".htm",sep="")
##    write(dat,currentpath)
##  }
##}

## REDO THE SCRAPING SO THAT I'M ONLY PULLING ARCHIVES THAT EXIST

## visit the page and get the links to the archives
#page <- readLines("http://blogs.iq.harvard.edu/sss/")
## grab just the links I want
#page <- page[grep("<h3>Archives</h3>",page):grep("<h3>Notification</h3>",page)]
#page <- page[grep("http://www.iq.harvard.edu/blog/sss/archives/",page)]
#links <- gsub("<li><a href=\"","",page)
#links <- gsub("\">.*?</a></li>","",links)
#
## loop over the links, getting the html, and writing it out to files
#for(i in 1:length(links)){
#  currenturl <- links[i]
#  dat <- readLines(currenturl)
#  currentpath <- paste("sss_html/",strsplit(currenturl,"/")[[1]][7],"_",strsplit(currenturl,"/")[[1]][8],".htm",sep="")
#  write(dat,currentpath)
#}

## get the files and parse them into blog posts
## RUN PYTHON: parseSSShtml.py
## (because the regular expressions are easier in python)

## UPDATE 4/20/2015. Start here with the parsed blog posts.
## load the docs using the tm package
## this is where the files are store
loc <- paste(mydir,"sss_posts",sep="")
paths <- DirSource(loc)
## This creates the corpus in R
(corp <- Corpus(paths))
## this creates a document-term matrix
dtm <- DocumentTermMatrix(corp, 
         control = list(tolower=T, stemming = T, stopwords = T, minWordLength = 3,
         removeNumbers = T, removePunctuation = list(preserve_intra_word_dashes = TRUE)))
## some quick changes to the matrix.
dtm <- as.data.frame(as.matrix(dtm))
dtm <- apply(dtm,MAR=2,as.integer)
## add rownames to the dtm
dtmDocNames <- gsub(paste(mydir,"sss_posts/",sep=""),"",paths$filelist,fixed=T)
rownames(dtm) <- dtmDocNames

## we can spot-check that the docs got matched up well
## (we are looking at the correspondence of words to counts)
sort(dtm[1,][dtm[1,]!=0])
corp[[rownames(dtm)[1]]]
## spot-check one more
sort(dtm[100,][dtm[100,]!=0])
corp[[rownames(dtm)[100]]]


###############################################################
## Unsupervised analysis - LDA
###############################################################

## This section uses LDA (topic models) to explore some aspects
## of the IQSS blog posts

## This is an LDA wrapper function (hat tip: Brandon Stewart)
LDA <- function(data, K, num.it=500, num.words=15) {
  require(lda)
  documents <- apply(data, 1, function(y) {
    x <- rbind(as.integer(which(y > 0) - 1L),
          as.integer(y[y > 0]))
    colnames(x) <- names(y)[which(y > 0)]
    return(x)
  })
   ## I set a seed internally so that things are reproduceable
   set.seed(02138)
   print("starting LDA")
   ## This is the main function
   result <- lda.collapsed.gibbs.sampler(documents,
                                       K,  ## Num clusters
                                       colnames(data),
                                       num.it,  ## Num iterations
                                       .1,
                                       .1)
  #top.words <- top.topic.words(result$topics, num.words, by.score=TRUE)
  return(result)
}
## End LDA function


## Now, we run the LDA analysis
## I pick 6 topics because I tried several options and liked 6 best
## [There is no "right" number of topics]
K <- 6
## We will time the LDA run (on large data sets it can take some time)
set.seed(1234)
t0 <- Sys.time()
## syntax is LDA(matrix, ntopics, niterations)
LDAout <- LDA(dtm, K, 2000)
t1 <- Sys.time()
t1-t0
alarm()

## We have to look at words and documents to infer the topics
## One good way of doing this is frequency, exclusivity, and 
## FREX (FRequency+EXclusivity).  I used code from Molly Roberts
## and Brandon Stewart:

## get the topic-term matrix
topicmat <- LDAout$topics
## molly and brandon appear to have words in the rows and topics in the colums in their code
topicmat <- t(topicmat)

## make holders to hold the Freq, Excl, and FREX word lists
ntopics <- K
nwords <- max(dim(topicmat))
frex <- matrix(nrow=nwords, ncol=ntopics)
rownames(frex) <- rownames(topicmat)
colnames(frex) <- colnames(topicmat)
freq <- frex
excl <- frex
## done making the holders

## calculate Freq, Exlc, and FREX in a loop
for(i in 1:ncol(topicmat)){
  s <- apply(topicmat,1,sum)
  if(sum(s==0)>0){print("Dividing by zero")}
  ecdf1 <- ecdf(topicmat[,i]/s)
  ecdf11 <-  ecdf1(topicmat[,i]/s)
  excl[,i] <- ecdf11
  ecdf2 <- ecdf(topicmat[,i])
  ecdf22 <- ecdf2(topicmat[,i])
  freq[,i] <- ecdf22
  frex[,i] <- 1/(.5/ecdf11 + (1-.5)/ecdf22) 
}

## This is how many words I want in my lists
nTopWords <- 25
## Now I make the word lists
freqTop <- apply(freq,2, function(x){rownames(freq)[rev(order(x))][1:nTopWords]})
exclTop <- apply(excl,2, function(x){rownames(excl)[rev(order(x))][1:nTopWords]})
frexTop <- apply(frex,2, function(x){rownames(frex)[rev(order(x))][1:nTopWords]})

## Let's start looking at the lists
freqTop[1:15,]  ## useful
exclTop[1:15,]  ## NOT VERY USEFUL
frexTop[1:15,]  ## useful

## we can create a combined list for each topic if that's useful
for(i in 1:K){
  myTopic <- i
  cat(paste("\n\n\nTopic",i,"\n\n"))
  print(cbind(freqTop[,myTopic],exclTop[,myTopic],frexTop[,myTopic]))
}

## figure out which topics are most represented and which documents are most
## representative of a single topic.

## calculate document proportions
docProportions <- apply(LDAout$document_sums,2, function(x){x/(sum(x))})
colnames(docProportions) <- dtmDocNames

## most representative docs for a topic (WHAT IS TOPIC 1?)
mytopic <- 1
topDocs <- docProportions[,rev(order(docProportions[mytopic,]))][mytopic,1:5]
topDocs
## we can check what the post is about
corp[[names(topDocs)[1]]]
corp[[names(topDocs)[2]]]
corp[[names(topDocs)[3]]]
## This can help us understand a topic!

mytopic <- 2
topDocs <- docProportions[,rev(order(docProportions[mytopic,]))][mytopic,1:5]
corp[[names(topDocs)[1]]]
corp[[names(topDocs)[2]]]
corp[[names(topDocs)[3]]]

## I'm labeling the topics (NOTE, a LOT of inference is happening here!!!)
topiclabels <- c("vis/code","results","misc","causal inf.","workshop","Amer. politics")
abbrevlabels <- c("v","r","m","c","w","a")

## overal topic proportions
barplot(as.vector(LDAout$topic_sums/(sum(LDAout$topic_sums))), width=.81)
## add words to label the toptics
myss <- seq(-.01,-.08,length.out=5)
for(i in 1:K){
  for(j in 1:5){
    text((i-1)*.98,myss[j],freqTop[j,i],xpd=NA,pos=4, cex=.8)
  }
}


## bring in the meta-data to look at topics and some covariates
## NOTE: the right way to do this is using a Structural Topic model:
## http://structuraltopicmodel.com/
## But we are going to be a little loose and fast.

## read in the covariate data we made
dat <- read.csv("sss_data.csv",as.is=T)
head(dat)
## make a new author field
dat$au <- dat$author
dat$au[dat$au==""] <- dat$poster[dat$au==""]
dat$au
## make some fixes by hand
dat$au[dat$au=="Drew Thomas"] <- "Andrew C. Thomas"
dat$au[dat$au=="You Jong-Sung"] <- "Jong-Sung You"
dat$au[dat$au=="Mike Kellermann"] <- "Michael Kellermann"
dat$au[dat$au=="Jim Greiner"] <- "James Greiner"

## clean up the dates ("%Y-%m-%d")
years <- sapply(dat$date,function(x){strsplit(x," ")[[1]][3]})
days <- sapply(dat$date,function(x){strsplit(x," ")[[1]][1]})
months <- sapply(dat$date,function(x){strsplit(x," ")[[1]][2]})
paste(years,months,days,sep="-")
dat$postdate <- as.Date(paste(years,months,days,sep="-"), format = "%Y-%B-%d")

## look at topic trends over date and author
unique(dat$au)
dsums <- LDAout$document_sums
colnames(dsums) <- dtmDocNames
## Look at the topics of my own posts
dat$filename[dat$au=="Richard Nielsen"]
mydsums <- dsums[,dat$filename[dat$au=="Richard Nielsen"]]
## What proportion of words did I devote to each topic
myTopicProportions <- apply(mydsums,1,sum)/sum(apply(mydsums,1,sum))
## Make the barplot
barplot(myTopicProportions, type="l", main="Richard Nielsen")
## add word labels
myss <- seq(-.01,-.1,length.out=5)
for(i in 1:K){
  for(j in 1:5){
    text((i-1)*1.2,myss[j],freqTop[j,i],xpd=NA,pos=4, cex=.8)
  }
}


## Let's do this for some authors we know
authorlist <- c("Gary King","Jens Hainmueller","Justin Grimmer","Eleanor Neff Powell",
                "Matt Blackwell","Richard Nielsen","Maya Sen","Jeff Gill")
## how many posts does each author have?
numposts <- sapply(unique(dat$au),function(x){sum(dat$au==x)})
sort(numposts)
## make comparison barplots
par(mfrow=c(2,4))
for(i in 1:length(authorlist)){
  mydsums <- dsums[,dat$filename[dat$au==authorlist[i]]]
  myTopicProportions <- apply(mydsums,1,sum)/sum(apply(mydsums,1,sum))
  names( myTopicProportions) <- abbrevlabels
  barplot(myTopicProportions, type="l",main=paste(authorlist[i],"\n(",numposts[authorlist[i]]," posts)",sep=""), ylim=c(0,1))
}
## Harvard folks are not as obsessed with causal inference as you'd think!

## Do k-means on the author-topic proportions
## get the full list of authors now
authorlist <- unique(dat$au)
## skip the ones with few posts
skip <- c("John Friedman and guest blogger Richard Holden","Sebastian Bauhoff and Jens Hainmueller","SSS Coauthors","Eric Werker (guest author)","Janet Rosenbaum (guest blogger)","Jens Hainmueller and Michael Hiscox")
authorlist <- authorlist[!authorlist %in% skip]
## make a data set with the topic proportions for each author
atopicsums <- matrix(NA,length(authorlist),K)
rownames(atopicsums) <- authorlist
for(i in 1:length(authorlist)){
  mydsums <- dsums[,dat$filename[dat$au==authorlist[i]]]
  atopicsums[i,] <- apply(mydsums,1,sum)/sum(apply(mydsums,1,sum))
}
## make it into a data frame
atopicsumsdat <- as.data.frame(atopicsums)
## run the kmeans algorithm
set.seed(12345)
km <- kmeans(atopicsumsdat, centers=5, nstart=25)
## look at the results
sort(km$cluster)
## Now, look at the results graphically
## make a function for creating ggplot-type colors
gg_color_hue <- function(n) {
  hues = seq(15, 375, length=n+1)
  hcl(h=hues, l=65, c=100)[1:n]
}
## make a color wheel with 5 colors
mycols <- gg_color_hue(5)
## create a plot showing the barplots for each author and cluster
par(mfrow=c(5,5),mar=c(2.5,2,2.5,1))
for(i in 1:length(authorlist)){
  alist = names(sort(km$cluster))
  mydsums <- dsums[,dat$filename[dat$au==alist[i]]]
  myTopicProportions <- apply(mydsums,1,sum)/sum(apply(mydsums,1,sum))
  names( myTopicProportions) <- abbrevlabels
  barplot(myTopicProportions, type="l",main=paste(alist[i],"\n(",numposts[alist[i]]," posts)",sep=""), ylim=c(0,1),
          col=mycols[km$cluster[alist[i]]])
}


## trends in topic proportion over time
## We need to create time bins -- I just use blocks of 180 days
ss <- seq(min(dat$postdate,na.rm=T),max(dat$postdate,na.rm=T),180)
## make a data frame with topic proportions by time block
dtopicsums <- matrix(NA,length(ss)-1,K)
rownames(dtopicsums) <- paste(ss[1:(length(ss)-1)]," to ",ss[2:(length(ss))],sep="")
for(i in 1:(length(ss)-1)){
  mydsums <- dsums[,na.omit(dat$filename[dat$postdate >= ss[i] & dat$postdat < ss[i+1]])]
  dtopicsums[i,] <- apply(mydsums,1,sum)/sum(apply(mydsums,1,sum))
}
## look at it to make sure it worked
dtopicsums

## plot the trends over time
mycols <- gg_color_hue(K)
dev.off()
par(mar=c(5,4,1,1))
plot(x=ss[1:(length(ss)-1)],y=dtopicsums[,1], type="n",ylim=c(0,1),
  ylab="proportion",xlab="time")
polygon(x=c(0,100000,100000,0),y=c(-10,-10,100,100),col="gray95")
for(i in 1:K){
  lines(x=ss[1:(length(ss)-1)],y=dtopicsums[,i], col=mycols[i])
}
legend("topright",lty=rep(1,K),col=mycols, legend=topiclabels, bty="n")



###############################################################
## Supervised analysis - Naive Bayes classification
###############################################################

## In this section, we'll hand-code some gold standard data
## and then use it to classify the rest of the posts.  I'm
## going to code 15 posts as "Causal Inference" or "Not" and
## Then classify the rest automatially.

##  randomly sample and code for causal inference
set.seed(567)
mysamp <- sample(dtmDocNames,15, replace=F)
## the sample should be this:
## [1] "2009_04_7.txt"  "2010_12_3.txt"  "2008_09_0.txt"  "2007_10_10.txt" "2006_05_8.txt"  "2006_04_7.txt"  "2005_11_12.txt"
## [8] "2007_10_7.txt"  "2008_04_11.txt" "2007_11_9.txt"  "2006_11_2.txt"  "2006_08_0.txt"  "2006_12_1.txt"  "2008_04_10.txt"
## [15] "2005_09_2.txt"

## I manually iterated through indexes here to read and code them
## There are ways to build an input form using tcl/tk that makes this easier
corp[[mysamp[15]]]


## question while coding: "Is this post at least partially about causal inference?"
coding <- c(1,0,1,0,0,
            0,0,0,0,0,
            0,0,1,0,1)

## naive bayes function
nbayes <- function(mat, class){   ## This function takes a document-term matrix and a list of document classes
  V <- colnames(mat)
  B <- length(V)
  C <- unique(class)
  ## For each class, calculate the word probabilities given the class
  wordprobs <- matrix(NA,B,length(C))  ## make a matrix  that has as many rows as words and as many columns as classes
  rownames(wordprobs) <- colnames(mat) ## name the rows of the matrix to be the words
  colnames(wordprobs) <- C             ## name the columns of the matrix to be the classes
  for(i in 1:length(C)){   ## For each class..
    sub.mat <- mat[class==C[i],,drop=FALSE]  ## subset the matrix to just the current class
    wordsums <- apply(sub.mat,MAR=2, sum)  ## take the column sums of the sub-matrix
    ## add the laplace prior of +1  
    wordprobs[,i] <- (wordsums + 1)/(sum(sub.mat) + B)  ## calculate the word "probabilities" and store them
  }
  return(wordprobs) ## return the matrix of word probabilities
}

## This is a function for calculating scores given a TDM and the output of nbayes
nbpredict2 <- function(tdm, nbayes.obj){
  holder <- rep(NA,nrow(tdm))  ## make a place to hold the output
  names(holder) <- rownames(tdm)  ## add document names to the holder
  lognbratio <- log(nbayes.obj[,1]/nbayes.obj[,2])  ## calculate the log likelihood ratio for each word
  for(i in 1:nrow(tdm)){  ## for each document...
    holder[i] <- sum(lognbratio*tdm[i,])  ## calculate the score, given the words in the document
  }
  return(holder)  ## return the holder
} 


## An example showing what it does
## THIS IS NOT THE IQSS BLOG
## I am working the example from here: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
mat <- matrix(NA,4,6)
colnames(mat) <- c("Chinese","Beijing","Shanghai","Macao","Tokyo","Japan")
rownames(mat) <- c("1","2","3","4")
mat[1,] <- c(2,1,0,0,0,0)
mat[2,] <- c(2,0,1,0,0,0)
mat[3,] <- c(1,0,0,1,0,0)
mat[4,] <- c(1,0,0,0,1,1)
## this is the final document-term matrix
mat
## this is the vector saying what class they are
class <- c("china","china","china","not")

tmp <- nbayes(mat,class)
nbpredict2(mat, tmp)
## Output should be
#        1         2         3         4 
# 1.564874  1.564874  0.908094 -1.613180  


## Now, we apply it to the IQSS blog
## first, pull out the training data
trainingDtm <- dtm[mysamp,]
dim(trainingDtm)

## create the vector with doc classes
coding2 <- rep(NA,length(coding))
coding2[coding==1] <- "causal"
coding2[coding!=1] <- "not"
## run the model
nbmodel <- nbayes(trainingDtm,coding2)
## we can see which words matter most
rev(sort(nbmodel[,"causal"]))[1:50]
## test it on the training set to make sure it's working
nbest <- nbpredict2(trainingDtm,nbmodel)
cbind(nbest,coding)
## seperate out the unclassified posts and classify them
testdtm <- dtm[-which(rownames(dtm) %in% mysamp),]
nbestFull <- nbpredict2(testdtm,nbmodel)
## look at the ones classified as causal
sort(nbestFull)
corp[[names(rev(sort(nbestFull))[1])]]
corp[[names(rev(sort(nbestFull))[2])]]
corp[[names(rev(sort(nbestFull))[3])]]
## Looks like it's working 

## make a word cloud of the posts classified as causal
hist(nbestFull)
abline(v=0)
causalwords <- apply(testdtm[nbestFull>0,],2,sum)
noncausalwords <- apply(testdtm[nbestFull<0,],2,sum)
par(mfrow=c(1,2))
set.seed(123)
wordcloud(words=names(causalwords),freq=causalwords,min.freq=10)
wordcloud(words=names(noncausalwords),freq=noncausalwords,min.freq=60)
## NOTE: This package isn't my favorite -- if you want to get serious
## about making a word cloud, I would hand-place the words (with some 
## machine assistance).  If that's not worth doing, then the word
## cloud probably isn't worth presenting.

## now that we have coding for "causal" posts, we can look at
## the proportion over time

## classify all the docs (training and not)
classif <- nbpredict2(dtm,nbmodel)
causal <- as.numeric(classif>0)
names(causal) <- names(classif)
## make a data set with the proportion of causal docs by time period
cprop <- rep(NA,length(ss)-1)
names(cprop) <- paste(ss[1:(length(ss)-1)]," to ",ss[2:(length(ss))],sep="")
cpropw <- cprop
## start the loop
for(i in 1:(length(ss)-1)){
  ## this is the proportion of DOCS that are causal
  cprop[i] <- mean(causal[na.omit(dat$filename[dat$postdate >= ss[i] & dat$postdat < ss[i+1]])])
  ## this is the proportion of WORDS that are causal
  docWordCount <- apply(dsums,2,sum)
  causalT <- causal[na.omit(dat$filename[dat$postdate >= ss[i] & dat$postdat < ss[i+1]])]
  cpropw[i] <- sum(docWordCount[names(causalT[causalT==1])])/sum(docWordCount[names(causalT)])
}

## plot the trends over time
mycols <- gg_color_hue(3)
dev.off()
par(mar=c(5,4,1,1))
plot(x=ss[1:(length(ss)-1)],y=dtopicsums[,1], type="n",ylim=c(0,1),
  ylab="proportion",xlab="time")
polygon(x=c(0,100000,100000,0),y=c(-10,-10,100,100),col="gray95")
## plot the causal topic
lines(x=ss[1:(length(ss)-1)],y=dtopicsums[,4], col=mycols[1])
## plot the prop of causal DOCS
lines(x=ss[1:(length(ss)-1)],y=cprop, col=mycols[2])
## plot the NB prop of causal WORDS
lines(x=ss[1:(length(ss)-1)],y=cpropw, col=mycols[3])
legend("topright",lty=rep(1,K),col=mycols, legend=c("prop. LDA \"causal\" topic","prop. documents classified as causal by NB","prop. words classified as causal by NB"), bty="n")

## They are correlated but different
summary(lm(cprop ~ dtopicsums[,4]))
summary(lm(cpropw ~ dtopicsums[,4]))


##################################################################
## Arabic example
##################################################################

## This example uses topic models for preliminary exploration of
## news articles from al-ahram the day after Mubarak stepped down.

## source the stemmer
library(arabicStemR)
packageVersion("arabicStemR") ## should be 1.1

## bring in the documents
fl <- list.files(paste(mydir,"ahram2011-02-12/",sep=""))

## stem the files and save them
textholderStemlist <- textholderStemmed <- list()
for(i in 1:length(fl)){
  tmp <- readLines(paste(paste(mydir,"ahram2011-02-12/",sep=""),fl[i],sep=""),encoding = "UTF-8")
  tmp <- paste(tmp,collapse=" ")
  tmp <- stem(tmp, returnStemList=T)
  textholderStemmed[[i]] <- tmp$text
  textholderStemlist[[i]] <- tmp$stemlist
}

textholderStemmed

library(tm)
library(topicmodels)
library(lda)


## Make a corpus out of the stemmed docs
paths <- VectorSource(textholderStemmed)
(corp <- Corpus(paths))

## make a document-term matrix
dtm <- DocumentTermMatrix(corp, 
         control = list(tolower=F, stemming = F, stopwords = F, minWordLength = 1,
         removeNumbers = F))

dim(dtm)

dtm2 <- removeSparseTerms(dtm, 0.9)
dim(dtm2)
dtm3 <- as.data.frame(as.matrix(dtm2))
dtm4 <- apply(dtm3,MAR=2,as.integer)
## add the document names
fl
dtmDocNames <- fl
rownames(dtm4)<- dtmDocNames
## here, need to make dtm4 into dtm3
dtm3 <- dtm4
rm(dtm4)
head(dtm3)
dim(dtm3)
dtm3[1:10,1:10]
dtm3 <- data.frame(dtm3)


###############
# LDA Function (same as before)
###############
LDA <- function(data, K, num.it=500, num.words=15) {
  require(lda)
  #data <- data[,-c(1:189)] 
  #documents <- apply(data, 1, function(y) {
  #  rbind(which(y > 0) - 1L,
  #        y[y > 0])
  # })

  ## the word zyn0 is giving me problems
  #data[["zyn0"]] <- NULL
  
  documents <- apply(data, 1, function(y) {
    x <- rbind(as.integer(which(y > 0) - 1L),
          as.integer(y[y > 0]))
    colnames(x) <- names(y)[which(y > 0)]
    return(x)
  })

  ## Make sure the words line up because brandon had some subtraction
  #cbind(names(data[1,])[data[1,]>0],
  #colnames(documents[[1]]))
  
   set.seed(02138)
   print("starting LDA")
   result <- lda.collapsed.gibbs.sampler(documents,
                                       K,  ## Num clusters
                                       colnames(data),
                                       num.it,  ## Num iterations
                                       .1,
                                       .1)
  #top.words <- top.topic.words(result$topics, num.words, by.score=TRUE)
  return(list(result=result,documents=documents))
}
###############


## LDA
set.seed(1234)
t0 <- Sys.time()
## syntax is LDA(matrix, ntopics, niterations)
K <- 3
LDAout <- LDA(dtm3, K, 2000)
t1 <- Sys.time()
t1-t0
alarm()

LDAout <- LDAout$result

## FREX scores from Molly and Brandon
topicmat <- LDAout$topics
## molly and brandon appear to have words in the rows and topics in the colums in their code
topicmat <- t(topicmat)

## prep holders for running the frex calculation
ntopics <- K
nwords <- max(dim(topicmat))
frex <- matrix(nrow=nwords, ncol=ntopics)
rownames(frex) <- rownames(topicmat)
colnames(frex) <- colnames(topicmat)
freq <- frex
excl <- frex
dim(frex)

## run the frex calculation
for(i in 1:ncol(topicmat)){
  print(i)
  s <- apply(topicmat,1,sum)
  if(sum(s==0)>0){print("Dividing by zero")}
  ecdf1 <- ecdf(topicmat[,i]/s)
  ecdf11 <-  ecdf1(topicmat[,i]/s)
  excl[,i] <- ecdf11
  ecdf2 <- ecdf(topicmat[,i])
  ecdf22 <- ecdf2(topicmat[,i])
  freq[,i] <- ecdf22
  frex[,i] <- 1/(.5/ecdf11 + (1-.5)/ecdf22) 
}

nTopWords <- 20

freqTop <- apply(freq,2, function(x){gsub("X","",rownames(freq))[rev(order(x))][1:nTopWords]})
exclTop <- apply(excl,2, function(x){gsub("X","",rownames(excl))[rev(order(x))][1:nTopWords]})
frexTop <- apply(frex,2, function(x){gsub("X","",rownames(frex))[rev(order(x))][1:nTopWords]})

## redo: filling in the most common stems
## a function for finding the most common stem from the stemlist
findMostCommonStem <- function(mystem){
  out <- names(rev(sort(table(names((unlist(textholderStemlist))[(unlist(textholderStemlist))==reverse.transliterate(mystem)]))))[1])
  return(out)
}

freqTop <- matrix(sapply(freqTop,findMostCommonStem),nrow(freqTop),ncol(freqTop))
exclTop <- matrix(sapply(exclTop,findMostCommonStem),nrow(exclTop),ncol(exclTop))
frexTop <- matrix(sapply(frexTop,findMostCommonStem),nrow(frexTop),ncol(frexTop))

## Then replace with arabic
freqTop <- matrix(sapply(freqTop,reverse.transliterate),nrow(freqTop),ncol(freqTop))
exclTop <- matrix(sapply(exclTop,reverse.transliterate),nrow(exclTop),ncol(exclTop))
frexTop <- matrix(sapply(frexTop,reverse.transliterate),nrow(frexTop),ncol(frexTop))

freqTop  ## NOTE THAT ARABIC DISPLAYS IN THE INCORRECT COLUMNS!
## ...and from here, everything is the same as in English

############################################################################
## END DEMONSTRATION



