## R code for the "Quantitative Text Analysis in Arabic" Workshop
## Rich Nielsen
## Cairo University, March-April 2019

## This file is split into sections
## 1. Acquiring Text
## 2. From Text to Data
## 3. Topic Models
## 4. Classification
## 5. Visualization
## 6. Twitter Data

## Each section builds on the results of the last section, so you must 
## start the code running from the beginning each time.


## Beginning of exercises for "Acquiring Text"

## Welcome to R.  If it is your first time programming in this computer language
## it is traditional to begin by coding the machine to print the phrase "Hello World"
## In the R GUI, you can run a line (or selection) by moving the cursor there
## or highlighting and typing Ctrl+R

print("Hello World")

## Now, print it in Arabic using the line below:

print("مرحبا للعالم")

## R does not understand Arabic script, so we have to enter it as Unicode.  More below.

## Comments in R:
## R will not try to run anything following a #.  We call this a "comment."
## While a single # will do, I use ## for legibility.
## It is *very* important to comment your code so you can remember what each piece does.

## R can do a variety of mathematical calculations

1 + 1

(3+4)/(10+1) + 7*2

## R is object-oriented.  It "thinks" in terms of objects saved in memory
## We assign values to an object using the "gets" operator <- 

x <- 1

## Now, x has the value "1" and we can work with it

x + x


## R can do a variety of "string" (text) manipulations.
## Enter a string using quote marks
## We can make an object that is the string "Hello World"

myString <- "Hello World"

## Now, when we look at the "myString" object, we get the value "Hello World"

myString

## We can put items together with the concatenate function c():

twoStrings <- c("Hello","World")

twoStrings

## We can use [ ] brackets to refer to an item by its index (starting from 1)

twoStrings[1]

twoStrings[2]

## We can also get everything EXCEPT a specified element by using the - sign in [ ]

twoStrings[-1]

## And we can paste those two strings together

paste(twoStrings[1], twoStrings[2])

## If we want to paste them together with no space, we use paste0()

paste0(twoStrings[1], twoStrings[2])

## R has a number of structures for data.  

## We have already seen vectors...elements together with each
## element of the same type:

a <- c(1,2,3,4)

a

b <- c("a","b","c","d")

b

## we can assign a vector to have names (but the length must match)

names(a) <- b

a

## Remember, we grab elements of a vector using [ ]

b[1]
b[1:3]
b[c(1,4)]

## And we can refer to an element by name

a["a"]

a[c("a","c")]

## A matrix is an array of items 

c <- matrix(c(1,2,3,4), nrow = 2, ncol = 2)
c

## it can also have names

rownames(c) <- c("a","b")
colnames(c) <- c("c","d")

c

## we refer to elements using the row and column indicators in [ ]

c[1,1]

c["a","c"]

c[1,]

c[,2] ## Note, this turns it into a vector


## A list is an object that can hold more complex objects of different types

d <- list(a = a, b = b, c = c)

d

## we refer to elements of a list using [[ ]] and sub-elements using either [ ] or [[ ]]
## as appropriate.

d[[1]]

d[[1]]["a"]

## R also has a structure called a data frame.  It is like a matrix in some ways and a list
## in others.

e <- data.frame(a = a, b = b)

## like a matrix, you can subset it by variable name
e[,"a"]

## but each variable can also be referred to as an element in a list

e[["a"]]

## We can assign rownames as before. They must be unique (two can't have the same name).

rownames(e) <- c("123","456","789","101112")


## can assign the column / variable names with names() or colnames()

colnames(e)

names(e) <- c("Var1","Var2")

## We can look at the top of a data frame with head()
head(e)

## Below, we will use a coding structure called a "for loop".  It tells the computer
## to iterate over a set of items and do something each time.

## We use the concatenate function to make our vector:

x <- c("Welcome","to","the","text","analysis","workshop")

## Then we start a for loop.  The "for(" starts the command.  
## The "i" is our counter that will go to the next element each time
## 1:length(x) spcifies what the loop is looping over.  You can see that
## that the ":" symbol will give us a range of numbers

## This gives us a range of numbers 1 to 10
1:10

## And this gives us a range of numbers 1 through however long the object "x" is:

1:length(x)

## Back to the for loop, the { } curly braces will contain the commands to execute
## each time.  This look executes two commands.  First we print the current value of 
## "i".  Then we print the i-th value of x, which we specify using [ ] square braces.

for(i in 1:length(x)){
  print(i)
  print(x[i])
}

## R can save files and load files from memory.  To do this, R needs to know
## where the files are located.  This requires that you use specify the 
## directory.

## Many computer users navigate files by pointing and clicking, but R can't
## do that.  Instead we specify a path with code.

## To see the current working directory:

getwd()

## For this tutorial, you should set the working directory where you 
## have saved and unzipped the tutorial materials.
## For me that is here:

myDirectory <- "C:/Users/Richard Nielsen/Dropbox (MIT)/01_Papers/arabicText/workshop/arabicTextAnalysis"

## REPLACE MY PATH WITH YOURS IN THE LINE ABOVE

## Note that it MUST be separated by forward slashes /
## If you use backslashes \, then you must use two each time \\

## Set the working directory

setwd(myDirectory)


## Regular expressions (regex) is a way to manipulate strings in R.

## grep() finds the element that matches

## For example, imagine we want to match parts of this vector

x <- c("A","collection","of","strings")

## We can get the index of the element that matches "collection":

grep("collection",x)

## We can also match parts of words:

grep("i",x)

## gsub() replaces one string with another string. 

gsub("collection","COLLECTION",x)


## R has many other basic building blocks, but this is hopefully enough to
## get started with the code for text analysis.


##########
## Collect a single document from the Al-Ahram online archive and save the text
##########

## We can bring in text from a variety of sources with the "readLines" command

rawHtml <- readLines("http://www.ahram.org.eg/archive/The-First/News/1230.aspx", encoding="UTF-8")

rawHtml

## We can save this to a place on our computer.  We need to indicate a directory
## where we want to save it.

dataDirectory <- "output/"

## We need to make this directory because it doesn't yet exist.

dir.create(dataDirectory)

## We also need to change the encoding of the file to "bytes"

Encoding(rawHtml) <- "bytes"

## And then we can write it out using the writeLines() command.

writeLines(rawHtml, paste0(dataDirectory, "1230.txt"))

## The text we just saved is in the html language.  We probably want to
## remove the hteml and save just the text of the newspaper article.

## We need to change the encoding back to "UTF-8"

Encoding(rawHtml) <- "UTF-8"

## If we look at the html code, we can see that the title is accompanied by
## the html tag "bbtitle".  We will use this in a "regular expression," also
## called "regex", to pull out the title text.

## Use grep() to identify the title

title <- rawHtml[grep("bbtitle",rawHtml)]

## Complex gsub expression (in regex syntax) to replace everything 
## inside < > brackets with nothing.

title <- gsub("<.*?>","",title)

## trimws() will clear leading and trailing spaces

title <- trimws(title)

## Now we have the title of the article

print(title)

## If we want to do the same for the text of the article, we do similar steps.
## Looking at the html code, we can see that the article is preceded by the
## label "abstractDiv" and that the tag "<!-- AddThis Button BEGIN" comes after.
## We use this to pull out the part of the html code with the article text.

## Again, we use grep to match the strings

text <- rawHtml[grep("abstractDiv",rawHtml):grep("<!-- AddThis Button BEGIN",rawHtml)]

## Then we substitute out one of the strings (because the regex for < > won't work).

text <- gsub("<!-- AddThis Button BEGIN","",text)

## Then we replace use gsub to get rid of everything in < > brackets

text <- gsub("<.*?>", "", text)

## Then we replace a stray set of characters with gsub().

text <- gsub("&nbsp;","",text)

## We use trimws() to trim the leading and trailing spaces

text <- trimws(text)

## We remove all of the lines that don't have anything anymore.

text <- text[text!=""]

## Now we can look at the resulting text

print(text)

## We might want to combine the title and the text and save them.
## We can combine strings with the c() function which concatenates them.
## We can name the resulting object anything we want.

fullText <- c(title,text)

## We use the Encoding() command to change the encoding to "bytes"

Encoding(fullText) <- "bytes"

## And then we can save the result.  I will overwrite our prior
## saved result.

writeLines(fullText, paste0(dataDirectory, "1230.txt"))


##########
## OCR for PDF and image files
##########

## What if text comes as a PDF or other analog format?  
## To "read" the document, we need to apply Optical Character Recognition

## The tesseract library provides a free solution
## Libraries (also called packages) are extra software that you can install to increase 
## the capabilities of R.  They are contributed for free by R users.  The first time you use
## a new library, you will need to install it with the install.packages() function:

#install.packages("tesseract")  ## Commented out because I don't want to reinstall it each time.

## Load the library

library(tesseract)

## Download the Arabic script for the tesseract library

tesseract_download("ara")

## Specify the file path to the PDF.  

pdfpath <- "data/ahramArticlePDF.pdf"

## If the PDF is online, this could be a web link.  I have saved the same
## pdf online here.

#pdfpath <- "https://www.mit.edu/~rnielsen/ahramArticlePDF.pdf"

## make an object that is the Arabic Engine for Tesseract

ara <- tesseract("ara")

## Then run the ocr function and get the result

ocrResult <- tesseract::ocr(pdfpath, engine = ara)

## To save the result, we again need to encode it as "bytes"

Encoding(ocrResult) <- "bytes"

## Write out to our preferred file path

writeLines(ocrResult, "output/ahramArticleOCR.txt")


## What if we had a photo instead of a PDF (working with archives, for example)?

## Here is a photograph of a printed copy of the Al-Ahram Article

jpgpath <- "data/ahramArticlePicture.JPG"

## Again, I have saved it online so you can try getting it from my website.

#jpgpath <- "https://www.mit.edu/~rnielsen/ahramArticlePicture.JPG"


## Run the tesseract Arabic engine

ocrResult2 <- tesseract::ocr(jpgpath, engine = ara)

## Encode the result as "bytes"

Encoding(ocrResult2) <- "bytes"

## Save the result.

writeLines(ocrResult2, "output/ahramArticlePictureOCR.txt")


##########
## [SKIP] Collect several week's worth of articles from the Al-Ahram online archive and save the text
##########

## This code is commented out because I don't want to have everyone
## scrape the Al-Ahram archive all at once.

#source("aux_code/scrape_ahram_archive.R")

## This will take a long time to run because the Al-Ahram website loads each page slowly.
## At the end, you will have a set of directories names by date.  Within each, you'll
## have an index page, subindex pages, and the article html, all in separate files.


##########
## Collect two weeks of Al-Ahram articles
##########

## Using regular expressions, I navigated the hyperlink structure of the Al-Ahram
## archive to download two weeks of articles.  I would normally walk you through
## that code, but I won't for two reasons: 1) the code is somewhat complex.  2) I 
## don't want to overload the Al-Ahram website.

## I have load the files to my own website, listing them at this "blind" link:
## https://www.mit.edu/~rnielsen/ahram/index.html

index <- readLines("https://www.mit.edu/~rnielsen/ahram/index.html")

head(index)

## If the code below won't work, you can also download all of the files at
## http://www.mit.edu/~rnielsen/ahramArticles.zip

## We can see that the structure is very simple.  All files are listed
## with hyperlinks.  Our goal is to extract the hyperlinks and visit
## each one.

## Because the links are given as the "name" of the links, we could just 
## remove everything in html tags < > with this regular expression.

links <- gsub("<.*?>","",index)

## However, that usually won't be the case.  We can extract links using
## regular expressions like this:

links <- gsub("^.*?href=\"","",index)

links 

links <- gsub(">.*?$","", links)



## Then we visit each link and save the result
## Recall that the links don't include the full url

## first, create a directory to save the results

getwd()


## Create a directory to save all of the results

dir.create("ahramArticles/")

## Loop over the links (NOTE: takes ~ 15 minutes)

for(i in 1:length(links)){

  ## print the counter to keep track of progress

  print(paste(i,"of",length(links),"links"))

  ## construct the filepath to save first
  
  savePath <- paste0("ahramArticles/",links[i])

  ## skip if we have already collected the file

  if(file.exists(savePath)){
    print(paste(savePath,"already exists"))
    next
  }

  ## read in the file with readlines()

  tmp <- readLines(paste0("https://www.mit.edu/~rnielsen/ahram/",links[i]), encoding="UTF-8")
  
  ## get the date of the article so we can organize them in separate directories

  currentDate <- strsplit(links[i],"/")[[1]][1]

  ## Create the dated directory if it doesn't yet exist.

  if(!dir.exists(paste0("ahramArticles/",currentDate))){ dir.create(paste0("ahramArticles/",currentDate))}
  
  ## change the encoding

  Encoding(tmp) <- "bytes"

  ## save the file

  writeLines(tmp, savePath)
}

## Look at the files on your computer to confirm that this loop downloaded
## them correctly.

##########
## Encodings (from http://www.mit.edu/~rnielsen/helpful.htm)
##########

## Computers can't natively represent Arabic characters.  They have to
## be "encoded."  When you type in Arabic in a computer, it is
## interpreting the letters using some encoding that represents the letters
## in ASCII characters.

## here's some arabic.  We are entering it using the unicode designations.
## https://unicode.org/charts/PDF/U0600.pdf

a <- "\u628\u633\u645 \uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a

## It prints nicely in R

## We can see that it is encoded in UTF-8

Encoding(a)

## If you want to write it out to a text file, first change the encoding to "bytes"

Encoding(a) <- "bytes"

## This will write it out to a text file

writeLines(a ,"output/out.txt")

## Surprisingly, this won't work in Base R. It writes out in ASCII.  Not super helpful.
## It does seem to work in RStudio thought...go figure.

write(a,"output/out2.txt")

## Reading in utf-8 text:

b <- readLines("output/out.txt",encoding = "UTF-8")
b

## sometimes, arabic gets dumped into files and looks like this:

x <- "<U+0627><U+0644><U+0635><U+0641><U+062D><U+0629>"
x

## This is the result of trying to write out UTF-8 encoded characters
## with writeLines().  The best solution is to go back and change the
## encoding before saving.  But if you can't do that, you can convert
## this back into Unicode with regular expressions.

## Not very useful...we'd like to convert x back to unicode
## http://stackoverflow.com/questions/17761858/converting-a-u-escaped-unicode-string-to-ascii
x1 <- paste(paste0("\\u",strsplit(gsub("<|>","",x), "U+",fixed=T)[[1]][-1]), collapse="")
x2 <- parse(text = paste0("'", x1, "'"))
x3 <- x2[[1]]
x3

## Another thing you might want to do is use unicode in urls
## It turns out that this is represented differently and needs to be fed into a url correctly as below:

a <- "\u625\u062D\u0635\u0627\u0621"

a  # the arabic

URLencode(a)  # the arabic in url encoding

## Now we can use this to browse web pages

browseURL(paste("https://ar.wikipedia.org/wiki/",a))



## End of exercises for "Acquiring Text"


## Beginning of exercises for "From Text to Data"

## Note, you must run the code from the beginning until here to continue.

##########
## Organize the text for analysis
##########

## After collecting several week's worth of articles from the Al-Ahram archives, we need
## to organize the text for analysis.  

## Use the list.files() command to get a vector of all the filepaths we collected

filepaths <- list.files("ahramArticles", full.names=T, recursive=T)

## We only want the articles, not the indexes. Use the grep() command to
## get just the file paths with the string "article" in them

filepaths <- filepaths[grep("article",filepaths)]

## Bring all of the text into memory by saving them in a list.  This is the empty object
## that we will fill up with the text.

textHolder <- c()

## Then, for each filepath, read it in using readLines() and save it as the i-th element
## of the list using [[ ]].

for(i in 1:length(filepaths)){
  textHolder[[i]] <- readLines(filepaths[i],encoding="UTF-8")
}

## We can name each element of the textHolder object with the filepaths to keep things organized.

names(textHolder) <- filepaths

## Check how many texts we have to work with

length(textHolder)

## Each element of textHolder is an article in raw html
textHolder[[1]]

## We want to change it into just the article text, ready for analysis.
## 1) isolate just the article text
## 2) Remove the html code
## 3) Stem and transliterate the article text

## The arabicStemR library do the stemming for us

library(arabicStemR)

## Example of stemming before we stem our documents

## bring in some text (Text from the Wikipedia page for Statistics)
## with some other strange characters for demonstration.

dat <- paste(readLines("data/statistics.txt",encoding="UTF-8"), collapse=" ")

dat

## stem and transliterate the results

stem(dat)

## Look at each piece of the stemmer code to see what it does
## We can see the function code by putting in the function without ()

stem

## Remove new line characters

dat <- removeNewlineChars(dat)
dat

## Remove punctuation

dat <- removePunctuation(dat)
dat

## remove Diacrtics

dat <- removeDiacritics(dat)
dat

## Remove English numbers

dat <- removeEnglishNumbers(dat)
dat 

## Remove Arabic numbers

dat <- removeArabicNumbers(dat)
dat

## Remove Farsi numbers

dat <- removeFarsiNumbers(dat)
dat

## standarize the Alifs

dat <- fixAlifs(dat)
dat

## clean out Unicode characters not in the Arabic range

dat <- cleanChars(dat)
dat

## clean out Latin characters

dat <- cleanLatinChars(dat)
dat

## remove stopwords

dat <- removeStopWords(dat)$text
dat

## to see what stopwords are removed by default

removeStopWords("a")$arabicStopwordList

## you can also put in a custom stopword list above.

## Remove the prefixes

dat <- removePrefixes(dat)
dat
dat <- removeSuffixes(dat)
dat

## Transliterate

dat <- transliterate(dat)
dat



## Now, proceed with cleaning and stemming our news articles
## make an empty vector to hold the cleaned text

textVector <- c()

## make an empty list to keep a record of what the stemmer did

stemListHolder <- c()

## make an empty vector to hold the url to add to the data set later

urlVector <- c()

## loop over the i elements of textHolder

for(i in 1:length(textHolder)){
   
  ## Print the i counter every 100 iterations so we can track our progres.

  if(i %% 100==0){print(i)}

  ## get the raw html and put it in its own object

  rawHtml <- textHolder[[i]]

  ## save the url
  
  urlVector[i] <- rawHtml[1]

  ## pull out the title, just as we did above for a single article.
  ## Note that if there is no title, we skip to the next article entirely.
  ## Not all article pages actually had article text.

  if(length(grep("bbtitle",rawHtml))==0){next}

  ## Get the title with grep

  title <- rawHtml[grep("bbtitle",rawHtml)]

  ## use regular expressions to remove the html code in < >

  title <- gsub("<.*?>","",title)

  ## trim the white space

  title <- trimws(title)

  ## Get the article text using grep()

  text <- rawHtml[grep("abstractDiv",rawHtml):grep("<!-- AddThis Button BEGIN",rawHtml)]

  ## Remove the html code and other strings we don't want.

  text <- gsub("<!-- AddThis Button BEGIN","",text)
  text <- gsub("<.*?>", "", text)
  text <- gsub("&nbsp;","",text)
  text <- trimws(text)
  text <- text[text!=""]

  ## combine the title and the text into a single string.

  fullText <- paste(c(title,text),collapse=" ")

  ## Stem the arabic string with arabicStemR's stem() function

  stem.out <- stem(fullText,returnStemList=TRUE)

  ## Save the results from the stemmer.  The stemmed text first:

  textVector[i] <- stem.out$text

  ## and the record of what the stemmer did second:

  stemListHolder[[i]] <- stem.out$stemlist
}


## Evaluate what the stemmer is doing

## For the first text, FALSE indicates words that were changed during stemming
## and TRUE indicates words that stayed the same.

table(stemListHolder[[1]]==names(stemListHolder[[1]]))

## How many words were stemmed in some way? TRUE means they were modified somehow

table(unlist(lapply(stemListHolder, function(x){x!=names(x)})))

## How many unique words are there without stemming?

length(unique(names(unlist(stemListHolder))))

## How many unique words are there with stemming?

length(unique(unlist(stemListHolder)))

## So we've gut the number of stems by about 50%


## Get the suffixes and prefixes 
suffixesAndPrefixesHolder <- c()
for(j in 1:length(stemListHolder )){
  print(paste(j,"of",length(stemListHolder )))
  suffixesAndPrefixes <- c()
  if(length(stemListHolder[[j]])==0){next}
  for(i in 1:length(stemListHolder[[j]])){
    stemmedunit <- stemListHolder[[j]][i]
    suffixesAndPrefixes <- c(suffixesAndPrefixes,strsplit(names(stemmedunit),stemmedunit)[[1]])
  }
  suffixesAndPrefixesHolder[[j]] <- suffixesAndPrefixes[-which(suffixesAndPrefixes=="")]
}

## Table the result so we have counts for each prefix and suffixes
suffixesAndPrefixesTab <- sort(table(unlist(suffixesAndPrefixesHolder)))

## List them one by one (because the Arabic names get reversed if we print the table all together)
for(i in 1:length(suffixesAndPrefixesTab)){
  print(suffixesAndPrefixesTab[i])
}
## This prints the number of times each prefix/suffix is removed in the data set






## Look at the resulting object to see what the stemmed documents look like

head(textVector)

## We can see what the stemmer did for the entire document with this object:

stemListHolder[[1]]

## make sure the urls look right

head(urlVector)

## One way to organize the text data is in a data frame
## Make a data frame using the data.frame command, where "filepath" is the
## names of the textholder (which are a unique identifier for each article), and
## "text" is the stemmed text.

dat <- data.frame(filepath = names(textHolder), url = urlVector, text = textVector,stringsAsFactors=F)

## We can add variables to a data frame
## For example, we will use the date that each article was published.  We can get this
## from the filepath using regex

dat$date <- as.Date(gsub("^.*?ahramArticles/|/article.*?$","",dat$filepath))

## Use the date variable to make a new variable with an indicator for each day.
## We will use this later.

dat$day <- as.numeric(dat$date)-min(as.numeric(dat$date))+1

## We can also get the section of the newspaper for each article from the url

dat$section <- gsub("http://www.ahram.org.eg/archive/|/News/.*?aspx$","",urlVector)

## We can specify a variable that is 1 if the section is "Sport" or 0 otherwise.
## We will use this later.

dat$sports <- as.numeric(dat$section=="Sport")


## do we have duplicates?

table(duplicated(dat$text))
table(duplicated(dat$url))

## Yes, remove the duplicates

dat <- dat[duplicated(dat$text)==F,]


## Now we can look at our organized, stemmed data set.
## How many rows and columns does it have?

dim(dat)

## What do the top 6 rows look like?

head(dat)






## An example of how to add bigrams to the dtm

mybigram <- "nad zmalk"

## For the next line, we'll need the stringr package, for the "str_extract_all" command
library(stringr)

## To count a single bigram, you could do this and add the column to the DTM
unlist(lapply(str_extract_all(dat$text,mybigram),length))

## To add a single bigram to all documents, you could do this and paste it to the
## documents before making a DTM.
unlist(lapply(str_extract_all(dat$text,mybigram),paste,collapse=" "))

dat$bigramText <- paste(dat$text, unlist(lapply(str_extract_all(dat$text,mybigram),paste,collapse=" ")))

## if we wanted to learn ALL of the bigrams around a particular stem
## we can do something more complicated

myunigram <- "nad"

textBigrams <- rep(NA, length(dat$text))
for(i in 1:length(dat$text)){
  mydoc <- dat$text[i]
  if(length(grep(myunigram,mydoc))>0){
    bigrams1 <- str_extract_all(mydoc,paste0("[0-9a-zA-z]+ [0-9a-zA-z]*",myunigram,"[0-9a-zA-z]*"))[[1]]
    bigrams2 <- str_extract_all(mydoc,paste0("[0-9a-zA-z]*",myunigram,"[0-9a-zA-z]* [0-9a-zA-z]+"))[[1]]
    bigrams1 <- paste0(sapply(bigrams1,function(x){strsplit(x," ")[[1]][1]}),"-",myunigram)
    bigrams2 <- paste0(myunigram,"-",sapply(bigrams2,function(x){strsplit(x," ")[[1]][2]}))
    textBigrams[i] <- paste(paste(bigrams1, collapse=" "), paste(bigrams2, collapse=" "))
  } else {
    textBigrams[i] <- ""
  }
}
## We can look at what bigrams there are
sort(table(strsplit(paste(na.omit(textBigrams),collapse=" ")," ")[[1]]))
## We could append these to the original documents before making
## the dtm
dat$text2 <- paste(dat$text,textBigrams)
## Then we would make the dtm below with "dat$text2"




##########
## Making a document term matrix 
##########


## Some of the R functions we will use expect the inputs in a different
## format than we created them for the SM.  We will recreate the DTM using
## the tm library.

library(tm)

## Make a corpus out of the stemmed docs

corp <- Corpus(VectorSource(dat$text))
  
## make a document-term matrix.  Note that we don't want much of the 
## pre-processing that the DocumentTermMatrix() can do because those
## tools are not adapted to Arabic.

dtm0 <- DocumentTermMatrix(corp, control = list(tolower=F, stemming = F, stopwords = F, minWordLength = 1,removeNumbers = F))

## Check how many terms there are

dim(dtm0)
# [1]  1107 32119

## Remove the infrequent terms

dtm <- removeSparseTerms(dtm0, 0.96)

## How many terms do we have left?

dim(dtm)

## make it a data frame (but don't do this with large DTMs!)
## You will run out of memory on your computer.
## Save a copy in the original format beforehand.

dtm <- data.frame(as.matrix(dtm))

## move the filepath over to be the rownames so we have a unique identifier
## for earch article.

rownames(dtm) <- gsub("ahramArticles/","",dat$filepath)


## Check if there are any documents with no words left (after 
## removing infrequent words)

table(rowSums(dtm)==0)

## There is one. We need to remove it.

dtm <- dtm[-which(rowSums(dtm)==0),]

## Check the dimensions now

dim(dtm)

## Look at the top again 

dtm[1:10,1:10]



##########
## Text analysis without models
##########

## Word searches

## Find all the texts with a word (in this case "fryQ", meaning "team")

rownames(dtm)[dtm[,"fryQ"]>0]

## Find all the texts with more than 5 instances of the word.

rownames(dtm)[dtm[,"fryQ"]>5]

## See what what the most frequently used words are

wordCounts <- sort(colSums(dtm))
wordCounts

## Plot the most frequently used words 
## (NOTE: plotting Arabic only works on Windows. There is some problem I can't figure out
## with Mac.)

plot(x=rev(rev(wordCounts)[1:20]),y=1:20,xlim=c(500,3500),
  xlab="Word Count",ylab="",axes=F, pch=19)
axis(1)
text(x=rev(rev(wordCounts)[1:20]),y=1:20, pos=2,cex=1.2,
  labels=sapply(gsub("X","",names(rev(rev(wordCounts)[1:20]))), reverse.transliterate))


## Word clouds

## load the wordcloud library

library(wordcloud)

## We can plot just the first 10 documents

wordcloud(dat$text[dat$section=="Sport"][1:10])

## But we want it in arabic so that we can read it

wordcloud(sapply(dat$text[dat$section=="Sport"][1:10],reverse.transliterate),random.order=F)

wordcloud(sapply(dat$text[dat$section=="Sport"][1:10],reverse.transliterate), colors=brewer.pal(8, "Dark2"),random.order=F)


## We can also make a word cloud from the DTM, but it takes some work

## First, create a vector of the word counts we want to plot from the dtm.
## Here, I do everything in a section
wordVec <- colSums(dtm[dat$section=="Sport",])
wordVec

## Then, this for loop creates a string with the words
mystring <- ""
for(i in 1:length(wordVec)){
  mystring <- paste(mystring, paste(rep(gsub("X","",names(wordVec)[i]),wordVec[i]),collapse=" "))
}

## Then we can plot the words, using reverse.transliterate
  
wordcloud(sapply(mystring,reverse.transliterate),random.order=F)



## End of exercises for "From Text to Data".

## Beginning of exercises for "Topic Models".  


## The next section uses the data from al-Ahram.  If you cannot
## scrape the data using the code above, you can download
## the R workspace with the data here:
## http://www.mit.edu/~rnielsen/workspace_for_topic_models.RData
## Then you can load it using the load() command
load("data/workspace_for_topic_models.RData")

##########
## Structural Topic Model
##########

## https://cran.r-project.org/web/packages/stm/vignettes/stmVignette.pdf
library(stm)

## There is an error in the stm textProcessor function, so use the source
## command to get my corrected version (which creates a function called
## textProcessor2).

source("aux_code/corrected_stm_functions.R")

## STM has a function for making a document term matrix that matches 
## the format the stm function expects.

processed <- textProcessor2(dat$text, lowercase = F, meta=dat,
                            removestopwords = F, removenumbers = F, removepunctuation = F,
                            stem = F, wordLengths = c(3, Inf), sparselevel = 1,
                            language = "na")

## We will then run the prepDocuments command to make the DTM, vocabulary vector, and meta data

out <- prepDocuments(processed$documents, processed$vocab, processed$meta,lower.thresh=25)

## You can see each element of this object
## out$documents is a list where each element is a document.
## The first row is a vector indicating a word in the vocabulary (by index)
## and the second row indicates the number of times that word occurs in the 
## document.

out$documents[[1]]

## out$vocab is the vector of words in the vocabulary

head(out$vocab)

## out$meta is the meta data, with the variables we made before

head(out$meta)

## To estimate a topic model, we need to specify K, the number of topics.
## This is a parameter the user must select.

K <- 12

## We set a seed so that we can get the same stm result every time.
## We then run the stm command.

set.seed(1234);  stm.out <- stm(documents = out$documents, vocab = out$vocab,
                                prevalence =~ sports + s(day),
                                K = K,data = out$meta,init.type = "Spectral")

## use the labelTopics command to get key words for the topics

topicLabels <- labelTopics(stm.out,n = 10)

## I like the FREX words best (frequency and exclusivity)
## The rows are the topics, and the columns are the words
## associated with each topic.

topicLabels$frex

## But you probably find it difficult to read my transliterated, stemmed
## arabic.  We might try transliterating back:

reverse.transliterate(topicLabels$frex[1,1])

## That's easier to read. However, the reverse transliterate function isn't
## very handy for working with a matrix (I need to improve it!).

## Also, it is still stemmed and the stems may not make sense.  This function
## uses the stemListHolder object to figure out which original word is the most
## likely match.

source("aux_code/reverseStem.R")

## If we were confused by "nad", we can see:

reverseStem("nad", stemListHolder)

## Translating all of the topic words requires some code using the apply() function.
## Explaining apply is more complicated than I want right now, so just see that it works.

arabicTopicLabels <- apply(topicLabels$frex,1,function(y){
   paste(sapply(y,function(x){reverseStem(x, stemListHolder)}), collapse=";")
   })

## now we can look at the result and get a better sense of what the topics are.

arabicTopicLabels

## We have to label the topics.  There's no way around making some judgement here.
## Better to make it explicitly.
                      
## If you can't tell what the topics are, try reading representative documents.
## Unfortunately, the commands for doing so easily are expecting english documents.
             
topic3Docs <- findThoughts(stm.out, texts = out$meta$text,n = 5, topics = 3)$docs[[1]]

## This is hard to read and reads left-to-right

topic3Docs[1]  

## This doesn't print legibly because R can't handle right-to-left
## text-wrapping

findThoughts(stm.out, texts = out$meta$text,n = 5, topics = 3)$docs[[1]]

## So I've just gotten good at reading my own transliteration.

## Or, you can feed in the filepath instead of the text and go look at the
## original.

findThoughts(stm.out, texts = out$meta$text,n = 5, topics = 5)$docs[[1]]

findThoughts(stm.out, texts = out$meta$url,n = 5, topics = 12)$docs[[1]]


## This is my list of how I labeled the topics.

myTopicLabels <- c("Agriculture",
                   "Regional News",
                   "Business",
                   "Religion",
                   "Swine flu",
                   "Society",
                   "Palestine",
                   "Sport",
                   "Egypt/Provinces",
                   "International",
                   "Journalism",
                   "Advice")
       
## I translated these topics into Arabic (correctly, I hope)

myTopicLabelsArabic <- c("zra30",
                   "aKbar aQlymy0",
                   "altjar0",
                   "aldyn",
                   "anflwnza",
                   "mjtm30",
                   "flsTyn",
                   "ryaD0",
                   "mSr/alm7afZat",
                   "aKbar dwly0",
                   "S7af0",
                   "mWwr0")

myTopicLabelsArabic <- sapply(myTopicLabelsArabic,reverse.transliterate)
    
## We can plot the prevalence of the topics.

plot(stm.out, custom.labels=myTopicLabelsArabic,cex=2)

## We can plot the correlation of the topics
## The first line calculates the correlation.

cormat <- topicCorr(stm.out)

## Then we plot it.  Note that the layout will change if we 
## don't set the seed (it rotates)

set.seed(12);plot(cormat, vlabels=myTopicLabels, vertex.color = "gray92", vertex.label.cex=1.5,
     vertex.size = 200*colSums(stm.out$theta)/sum(colSums(stm.out$theta)))

set.seed(12);plot(cormat, vlabels=myTopicLabelsArabic, vertex.color = "gray92", vertex.label.cex=2,
     vertex.size = 200*colSums(stm.out$theta)/sum(colSums(stm.out$theta)))


## The "perspectives" plot allows us to see how two topics differ
## in the stems they use.

plot(stm.out, type = "perspectives", topics = c(1, 3))

## We can also estimate the correlation of topics with the variables
## for "day" and "sports" we included in the STM model

prep <- estimateEffect(1:K ~ sports + s(day), stm.out, meta = out$meta, uncertainty = "Global")

## This loop plots a figure for each topic showing how it changes over time.
## Note, you may see "(Error in plot.new() : figure margins too large)"

dev.off() ## Shuts down the previous plot
par(mfrow=c(2,6))
for(i in 1:K){
plot(prep, "day", method = "continuous", topics = i,
 model = z, printlegend = FALSE, xlab = "day", ylim=c(0,.3),
 main=paste(myTopicLabels[i],"\n",paste(strsplit(arabicTopicLabels[i],";")[[1]][1:3],collapse="\n")))
}


## Plot the correlation between the "Sports" section and the topics

dev.off() ## Shuts down the previous plot
par(mar=c(5,5,2,2))
plot(prep, covariate = "sports",
  model = stm.out, method = "difference",
  cov.value1 = 1, cov.value2 = 0, 
  labeltype = "custom",
  custom.labels = myTopicLabels, bty="n", xlim=c(-.2,.8),
  main="Correlation of \"Sport\" Section with Topics",
  xlab="Difference in predicted topic proportion")


## Look at the results with different numbers of topics

## Three topics
K <- 3
set.seed(1234);  stm.out.3 <- stm(documents = out$documents, vocab = out$vocab,
                                prevalence =~ sports + s(day),
                                K = K,data = out$meta,init.type = "Spectral")

topicLabels <- labelTopics(stm.out.3,n = 10)
topicLabels$frex
arabicTopicLabels <- apply(topicLabels$frex,1,function(y){
  paste(sapply(y,function(x){reverseStem(x, stemListHolder)}), collapse=";")
})
arabicTopicLabels

## Three topics
K <- 30
set.seed(1234);  stm.out.30 <- stm(documents = out$documents, vocab = out$vocab,
                                  prevalence =~ sports + s(day),
                                  K = K,data = out$meta,init.type = "Spectral")

topicLabels <- labelTopics(stm.out.30,n = 10)
topicLabels$frex
arabicTopicLabels <- apply(topicLabels$frex,1,function(y){
  paste(sapply(y,function(x){reverseStem(x, stemListHolder)}), collapse=";")
})
arabicTopicLabels

## End of exercises for "Topic Models".  

## Beginning of exercises for "Classification".  


###########
## Supervised learning
##########

## We will use functions from the caret library throughout

library(caret)

## Add the indicator for whether the article is in the sports section.
## We will use this as our outcome that we are trying to predict.

dtm$sports <- out$meta$sports

## make vectors of the predictor variable names (also called "features" in
## machine learning).

predictors <- names(dtm)[-which(names(dtm)=="sports")]

## And save the name of the outcome so that we can make our code more general
## in case we want to switch outcomes.

outcome <- "sports"

## split into a training set and a test set
## First, select the size for the training set (the test set will be 
## the remainder).

N <- 250

## Then set a seed (making the random draw replicable) and sample N of the
## rows of the DTM.

set.seed(1234);mysample <- sample(1:nrow(dtm),N,replace=F)

## split the dtm into two parts, one for training and one for testing.

dtmTraining <- dtm[mysample,]
dtmTest <- dtm[-mysample,]

##########
## Linear Prediction with Regression
##########

## One of the simplest but most powerful tools is a linear model.
## Here we pick a single word, "fryQ", and use the count of that word
## in the article to predict whether the article should be in the Sports
## section or not.  We fit using the lm() function in R.

linearPredictorFit <- lm(sports ~ fryQ, dtmTraining)

## We can see the coefficients of the linear model with the summary command.

summary(linearPredictorFit)

## We can see what the predictor is doing on this scatter plot

plot(jitter(dtmTraining$fryQ), jitter(dtmTraining$sports,.2), col="#00000030", pch=19)
abline(linearPredictorFit)
abline(h=.5,lty=3)
abline(v=2.7, lty=3)

## We can see that this matches what we get when we look at a confusion matrix.

## We can make one easily by comparing the fitted values to the actual outcome.

table(linearPredictorFit$fitted.values > .5, dtmTraining[, outcome])

## But with the confusionMatrix function in the caret library, we get more information.
## First, we need to put the predicted values in the "factor" format the function expects.

predicted <- as.factor(as.numeric(linearPredictorFit$fitted.values > .5))

## Then we also put the outcome into that format.

actual <- as.factor(as.numeric(dtmTraining[,outcome]))

## Then we can calculate the confusion matrix with accuracy estimates.

confusionMatrix(predicted, actual)

## This looks very good, but we are predicting to the data that we used to fit
## the model.  There is a large risk of overfitting to the training data.  The
## real test is how well it fits in the test set.

## To predict to the test set, we need to add an Intercept column of all 1s to 
## the DTM.  This will be multiplied by the intercept term in the linear model.

dtmTest[["(Intercept)"]] <- 1

## We predict the value of one observation by multiplying the variable values by
## the coefficients and summing them, as implied by the linear model.

sum(dtmTest[1,names( linearPredictorFit$coeff)] * linearPredictorFit$coeff)

## We would then see if this is below or above .5 (normally).

## To do this calculation for all observations quickly, we use matrix algebra.
## This shows you that we recover the same fitted value for the first observation
## in the test set.

as.matrix(dtmTest[1,names( linearPredictorFit$coeff)]) %*% as.matrix(linearPredictorFit$coeff)

## This gets fitted values for all of the observations in the test set.

linearPrediction1 <- as.matrix(dtmTest[,names( linearPredictorFit$coeff)]) %*% as.matrix(linearPredictorFit$coeff)

## There are many "canned" packages for doing this sort of prediction (such as caret)
## but I think it's important to be able to see the math going on under the hood if 
## you want.

## We can now calculate our confusion matrix as before.

predicted <- as.factor(as.numeric(linearPrediction1 > .5))
actual <- as.factor(as.numeric(dtmTest$sports))
confusionMatrix(predicted, actual)

## This is working fairly well on the test data.  


## What if we had picked a less good predictor variable?
## I illustrate with the word "b3D", meaning "some", which is not
## indicative of sports or not.

linearPredictorFit <- lm(sports ~ b3D, dtmTraining)

## We can see that the coefficient is not different from 0
## and the R-squared is very close to zero.

summary(linearPredictorFit)

## See what the predictions look like in the training set 

predicted <- as.factor(as.numeric(linearPredictorFit$fitted.values > .5))
actual <- as.factor(as.numeric(dtmTraining[,outcome]))
confusionMatrix(predicted, actual)

## The model predicts "not sports" every time, which make sense given
## that the most common category is "not sports," and the word "b3D"
## provides no additional information.
## The accuracy is actually very high.  This is because we get 90% accuracy
## just by always selecting "not sports," but we will have a lot of 
## false negatives.

## Now, we try to improve the accuracy by adding a few more words that 
## seem related to sport from our background knowledge.

linearPredictorFit <- lm(sports ~ fryQ + ryaD + la3b + mbara + ml3b + bTwl, dtmTraining)

## looking at the coefficients, we see the fit is good

summary(linearPredictorFit)

## Notice that not all coefficients are statistically significant, but
## the combined R-squared is very good.

## As before, the real test is how we do in the test sample.

linearPrediction1 <- as.matrix(dtmTest[,names( linearPredictorFit$coeff)]) %*% as.matrix(linearPredictorFit$coeff)

## construct the confusion matrix

predicted <- as.factor(as.numeric(linearPrediction1 > .5))
actual <- as.factor(as.numeric(dtmTest$sports))
confusionMatrix(predicted, actual)

## With very simple tools, we are getting 95% accuracy, though better at predicting
## what won't be in the Sport section than what will. 


##########
## Linear prediction with lasso
##########

## What if we did not want to select from the ~1000 words in the DTM
## because we didn't know what might be the best predictor?

## To see why the linear model fails, let's try

linearPredictorFit <- lm(sports ~ ., dtmTraining)

## If we look at the summary, we see that many coefficients are missing.

summary(linearPredictorFit)

## If we try to predict to a new observation, we get a missing value

sum(dtmTest[1,names( linearPredictorFit$coeff)] * linearPredictorFit$coeff)

## The problem is that we have K >> N, where K is the number of predictors
## and N is the number of observations.  The linear model won't work.

## By the way, we need to make sure to get rid of the intercept variable we added

dtmTest[["(Intercept)"]] <- NULL


## Lasso offers a solution to the problem of K >> N.  We "shrink" most of the coefficients
## to zero and only allow the most informative variables to have a positive coefficient.

## We need the glmnet library

library(glmnet)

## Again, we need to reformat our data in the way this function expects.
## x.1 is the training data dtm, but with the outcome removed
## y.1 is the outcome in the training data
## x.2 is the test data dtm, with the outcome removed
## y.2 is the outcome in the test data

x.1 <- as.matrix(dtmTraining[,-which(names(dtmTraining)=="sports")])
y.1 <- dtmTraining$sports
x.2 <- as.matrix(dtmTest[,colnames(x.1)])
y.2 <- dtmTest$sports

y.1 <- as.character(y.1)
y.2 <- as.character(y.2)

## We fit the lasso on the training data with the normal link function

lasso.1 <- glmnet(y=y.1, x= x.1, family="binomial", nlambda=200)

## We can see how each of the coefficients change as we allow more variables.

plot(lasso.1) 

## We can also print the list of coefficients for a given lamda

plot(sort(coef(lasso.1)[,"s199"]),col="#00000030", pch=19,
  main="Coefficients of variables in the Lasso",
  xlab="variable index",ylab="coefficient size",
  ylim=c(-.1,.2)) 
points(sort(coef(lasso.1)[,"s150"]),col="#FF000010", pch=19) 
points(sort(coef(lasso.1)[,"s50"]),col="#0000FF10", pch=19) 
legend("topleft",pch=c(19,19,19), col=c("#00000030","#FF000030","#0000FF30"),
  legend=paste("lambda =",round(c(lasso.1$lambda[199],lasso.1$lambda[150],lasso.1$lambda[50]),4))
)

## We can get predictions out of the lasso

predict.1 <- predict(lasso.1, newx=x.1, type = "class")

## Because we didn't specify, it gives us predictions at every lambda it used.

colnames(predict.1)

## Here I pick a lambda that predicts perfectly in the training sample.

table(predict.1[,"s150"], dtmTraining$sports)

predicted <- as.factor(as.numeric(predict.1[,"s150"]))
actual <- as.factor(as.numeric(dtmTraining$sports))
confusionMatrix(predicted, actual)

## But as always, the question is how does it do out of sample?

predict.2 <- predict(lasso.1, newx=x.2, type = "class")

predicted <- as.factor(as.numeric(predict.2[,"s150"]))
actual <- as.factor(as.numeric(dtmTest$sports))
confusionMatrix(predicted, actual)

## It does well, but not perfectly.  The prior model was perhaps slightly
## overfitting the data.

## We can try to avoid overfitting using a cross-validation method.

cv.lasso.1 <- cv.glmnet(y=y.1, x= x.1, family="binomial")

## Look at the cross-validated best lambda

plot(cv.lasso.1) # Plot CV-MSPE

## Predict both parts using Training data fit

predict.1.1 <- predict(cv.lasso.1, newx=x.1, s="lambda.min", type = "class")
predict.1.2 <- predict(cv.lasso.1, newx=x.2, s="lambda.min", type = "class")

## Look at the accuracy in the training data

predicted <- as.factor(as.numeric(predict.1.1))
actual <- as.factor(as.numeric(dtmTraining$sports))
confusionMatrix(predicted, actual)

## Look at the accuracy in the test data

predicted <- as.factor(as.numeric(predict.1.2))
actual <- as.factor(as.numeric(dtmTest$sports))
confusionMatrix(predicted, actual)


##########
## Classification with Random Forests
##########


## We specify values for our parameter search.  We're going to try a lot of 
## combinations and see what works best through 10-fold cross validation.

mtryVals <- floor(seq(10, 400, length = 10))
mtryGrid <- data.frame(.mtry = mtryVals)

## This function estimates the random forest classifier parameters.
## The caret library has MANY other models that we could also try.

set.seed(1234)
system.time(
randomForestFit <- train(sports ~ ., data = dtmTraining,
                 method = "rf",
                 tuneGrid = mtryGrid,
                 ntree = 1000,
                 imortance = TRUE,
                 trControl = trainControl(## 10 fold rf
                 				method = "oob",
                 				number = 10,
                 				##repeat 10x
                 				repeats = 10,
                 				classProbs = TRUE,
                 				selectionFunction = "tolerance")
                 )
)  ## End system.time
alarm()


## we can look at the object to learn some about the best fit    

randomForestFit

## Look at our in-sample training set prediction

inSamplePred <- as.numeric(predict(randomForestFit, newdata = dtmTraining) > .5)
confusionMatrix(data = as.factor(inSamplePred), as.factor(dtmTraining[[outcome]]))

## The accuracy looks really good.  But are we overfitting badly?
## Let's predict to the test set to find out.

outOfSamplePred <- as.numeric(predict(randomForestFit, newdata = dtmTest) > .5)

confusionMatrix(data = as.factor(outOfSamplePred), as.factor(dtmTest[[outcome]]))

## This 97% accuracy seems to be about the best we can do on these data.
## Also, 97% accuracy is better than you can possibly do on most interesting 
## classification problems.


##########
## Naive Bayes Classifier (custom, so that you can see the underlying calculations)
##########

## There are many libraries in R that will let you estimate a niave bayes classifier
## but I'm going to show you a custom version so that for at least one method, the
## underlying calculations are clear.

## Rather than loading a library, we need the following two functions.

## naive bayes function

nbayesEstimate <- function(mat, class){   ## This function takes a document-term matrix and a list of document classes
  V <- colnames(mat)
  B <- length(V)
  C <- unique(class)
  ## For each class, calculate the word probabilities given the class
  wordprobs <- matrix(NA,B,length(C))  ## make a matrix  that has as many rows as words and as many columns as classes
  rownames(wordprobs) <- colnames(mat) ## name the rows of the matrix to be the words
  colnames(wordprobs) <- C             ## name the columns of the matrix to be the classes
  for(i in 1:length(C)){   ## For each class..
    sub.mat <- mat[class==C[i],,drop=FALSE]  ## subset the matrix to just the current class
    wordsums <- apply(sub.mat,MAR=2, sum)  ## take the column sums of the sub-matrix
    ## add the laplace prior of +1  
    wordprobs[,i] <- (wordsums + 1)/(sum(sub.mat) + B)  ## calculate the word "probabilities" and store them
  }
  out <- list(wordProbabilities = wordprobs, classProportions = (table(class)/sum(table(class)))[colnames(wordprobs)]) 
  return(out) ## return the list
}

## This is a function for calculating scores given a DTM and the output of nbayes
nbayesPredict <- function(NBdtm, nbayes.obj){
  wordProbabilities <- nbayes.obj$wordProbabilities
  classProportions <- nbayes.obj$classProportions
  holder <- rep(NA,nrow(NBdtm))  ## make a place to hold the output
  names(holder) <- rownames(NBdtm)  ## add document names to the holder
  logprior <- log(classProportions[colnames(wordProbabilities)][1]/classProportions[colnames(wordProbabilities)][2])
  lognbratio <- log(wordProbabilities[,1]/wordProbabilities[,2])  ## calculate the log likelihood ratio for each word
  for(i in 1:nrow(NBdtm)){  ## for each document...
    holder[i] <- logprior + sum(lognbratio*NBdtm[i,])  ## calculate the score, given the words in the document
  }
  return(holder)  ## return the holder
} 



## An example showing what it does
## I am working the example from here: http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html

## First, make an empty matrix that will be our DTM

mat <- matrix(NA,5,6)

## The column names are the words in the documents

colnames(mat) <- c("Chinese","Beijing","Shanghai","Macao","Tokyo","Japan")

## The rownames are the names of the documents

rownames(mat) <- c("1","2","3","4","5")

## These are the word counts of the documents

mat[1,] <- c(2,1,0,0,0,0)
mat[2,] <- c(2,0,1,0,0,0)
mat[3,] <- c(1,0,0,1,0,0)
mat[4,] <- c(1,0,0,0,1,1)
mat[5,] <- c(3,0,0,0,1,1)

## this is the final document-term matrix

mat
matTraining <- mat[1:4,]
matTest <- mat[5,,drop=F]

## this is the vector saying what class they are (about China or not)

class <- c("china","china","china","not")

## We can use the nbayesEstimate() function to estimate the parameters for the classifier

tmp <- nbayesEstimate(matTraining,class)

## Then we use nbayesPredict() to generate predictions

nbayesPredict(matTraining, tmp)

## This is the correct output.
##        1         2         3         4 
## 2.663486  2.663486  2.006706 -0.514568  

## Now apply to the test document
nbayesPredict(matTest, tmp)

## This is the correct output
##        5 
## 0.798991


## To see how it works, look at the internal parts of the functions
## This is the inside of nbayesEstimate:

## SETUP:

## we need to rename out training dtm to match what the internals 
## of the function expect:

mat <- matTraining

## First, we get some information from the "mat" and "class" objects
V <- colnames(mat) ## get the number of words
B <- length(V) ## save the number of words as B
C <- unique(class) ## get a vector of the classes

## For each class, calculate the word probabilities given the class
## make a matrix  that has as many rows as words and as many columns as classes

wordprobs <- matrix(NA,B,length(C))  
  
## name the rows of the matrix to be the words

rownames(wordprobs) <- colnames(mat) 

## name the columns of the matrix to be the classes

colnames(wordprobs) <- C  

## CALCULATION
           
## Loop over each of the classes of documents

#for(i in 1:length(C)){   ## For each class..

## But let's just look at the first class for now
i <- 1

## subset the matrix to just the current class

sub.mat <- mat[class==C[i],,drop=FALSE]  

sub.mat
  
## take the column sums of the sub-matrix

wordsums <- apply(sub.mat,MAR=2, sum)  

wordsums

## We add the laplace prior of +1 to each word in the numerator and
## the denominator, and calculate word probabilities all in one step.

wordprobs[,i] <- (wordsums + 1)/(sum(sub.mat) + B)  

## Note that adding + B in the denominator is equivalent to adding + 1 to each word:

sum(sub.mat) + B

## is the same as this:

sum(colSums(sub.mat)+1)

## Run the whole loop quickly so that we have the word probabilities

for(i in 1:length(C)){   ## For each class..
  sub.mat <- mat[class==C[i],,drop=FALSE]  
  wordsums <- apply(sub.mat,MAR=2, sum)  
  wordprobs[,i] <- (wordsums + 1)/(sum(sub.mat) + B)
}

## Now, look at the internals of the prediction function

## First, we need to rename the Test data to what the function internals expect.

NBdtm <- matTest
NBdtm

## We need the word probabilities calculated in the nbayesEstimate function

wordProbabilities <- wordprobs

wordProbabilities

## We also need the ratio of the class proportions in the training set

classProportions <- (table(class)/sum(table(class)))[colnames(wordprobs)]

classProportions

## Then we set up an object to hold the result

holder <- rep(NA,nrow(NBdtm))  ## make a place to hold the output
names(holder) <- rownames(NBdtm)  ## add document names to the holder

## Calculate the log prior of the class proportion ratio

logprior <- log(classProportions[colnames(wordProbabilities)][1]/classProportions[colnames(wordProbabilities)][2])

logprior  ## Prior odds are that the class is "china"

## then calculate the log ratio of the word probabilities

lognbratio <- log(wordProbabilities[,1]/wordProbabilities[,2])  ## calculate the log likelihood ratio for each word

lognbratio

## Note that these are positive for the words that indicate class 1
## and negative for the words that indicate class 2.

## Then, for each row of the new dtm, we do multiply the word count in the new DTM
## by the log ratio

lognbratio*NBdtm[1,]

## and take the sum of it.

sum(lognbratio*NBdtm[1,])

## At this point you can see that the score is negative, so if we had
## no prior about the class proportions, we would assume even odds.
## That would be 50% each class, and log(.5/.5) == 0.  So we would 
## predict "not".

## But we do have informative prior odds, and we add them

logprior + sum(lognbratio*NBdtm[1,])

## So the predicted class is "china"

## END worked example of the internals of the Naive Bayes classifier function.




## Now we are ready to apply the naive bayes classifier to the newspaper data

myNB <- nbayesEstimate(dtmTraining[,predictors],as.character(dtmTraining[[outcome]]))

## we can see what words matter in the model this way

sort(myNB$wordProbabilities[,2]-myNB$wordProbabilities[,1])

## Generate in-sample predictions for the Traning set

insamplePred <- nbayesPredict(dtmTraining[,predictors], myNB)

## we can look at what the classifier does by plotting the numeric
## predictions agains the real outcome

plot(insamplePred, dtmTraining[[outcome]], col="#00000030",pch=19, 
  xlab="Naive Bayes numerical prediction",
  ylab="Real outcome",main="Training data")
abline(v=0,lty=3)

## Calculate an in-sample confusion matrix as before

predicted <- as.factor(as.numeric(insamplePred < 0))
actual <- as.factor(dtmTraining[[outcome]])
confusionMatrix(predicted, actual)

## Calculate out of sample predictions

outsamplePred <- nbayesPredict(dtmTest[,predictors], myNB)

## Plot the predicted vs. actual in the Test set

plot(outsamplePred, dtmTest[[outcome]], col="#00000030",pch=19,
  xlab="Naive Bayes numerical prediction",
  ylab="Real outcome",main="Test data")
abline(v=0,lty=3)
table(dtmTest[[outcome]], outsamplePred < 0)

## Calculate the confusion matrix for the Test data

predicted <- as.factor(as.numeric(outsamplePred < 0))
actual <- as.factor(dtmTest[[outcome]])
confusionMatrix(predicted, actual)

## We can change the false negatives and positives by 
## changing the prior on the categories

myNB$classProportions[1] <- .3
myNB$classProportions[2] <- .7
outsamplePred <- nbayesPredict(dtmTest[,predictors], myNB)

predicted <- as.factor(as.numeric(outsamplePred < 0))
actual <- as.factor(dtmTest[[outcome]])
confusionMatrix(predicted, actual)


## End of exercises for "Classification".  

## Beginning of exercises for "Visualization".

##########
## Plotting Arabic Text in R (Windows Only -- Does not work on Mac)
##########

## An important part of research is communicating your model and your results.
## We've already seen a number of plots.  Here is one that I will build piece by
## piece to show which words matter most in the Naive Bayes classifier.

## These differences in the two colums of the naive bayes output show which
## words matter most in the classifier.  But putting them in a table isn't very 
## easy to read.

wordDiffs <- sort(myNB$wordProbabilities[,2]-myNB$wordProbabilities[,1])

## This tells me which words are most frequent in all the articles, but again,
## hard to read in a table.

wordPrevalence <- colSums(dtm[,names(wordDiffs)])/sum(colSums(dtm[,names(wordDiffs)]))

## For a simple plot, I plot the stems with the x-axis showing how much the word
## matters and the y-axis random uniform numbers to spread the words out.

set.seed(123456);yvals <- runif(1:length(wordDiffs),0,1)
plot(x=wordDiffs,y=yvals,type="n")
text(x=wordDiffs,y=yvals,labels=names(wordDiffs))

## We can improve the figure by adding better labels
plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab="", axes=F,
  main="Which words predict Sports articles?",
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,"Word weights for predicting \"Sports\"\nLess Sports ............. More Sports", xpd=NA)
text(x=wordDiffs,y=yvals,labels=names(wordDiffs),xpd=NA)

## We don't need all the words in the middle, so we can make them
## smaller and lighter using transparency.

wordTransparency <- round(abs(wordDiffs)/max(abs(wordDiffs)) *100,0)+30
wordTransparency[wordTransparency>99] <- 99
wordTransparency <- paste0("#000000",wordTransparency)

wordCex <- abs(wordDiffs)/max(abs(wordDiffs))*3

plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab="", axes=F,
  main="Which words predict Sports articles?",
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,"Word weights for predicting \"Sports\"\nLess Sports ............. More Sports", xpd=NA)
text(x=wordDiffs,y=yvals,labels=names(wordDiffs),xpd=NA,col=wordTransparency,cex=wordCex)


## Now, we want to put the words in Arabic so we can actually read them.

arabicWords <- sapply(names(wordDiffs),reverseStem,stemListHolder)

plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab="", axes=F,
  main="Which words predict Sports articles?",
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,"Word weights for predicting \"Sports\"\nLess Sports ............. More Sports", xpd=NA)
text(x=wordDiffs,y=yvals,labels=arabicWords,xpd=NA,col=wordTransparency,cex=wordCex)


## We could try resizing the words so that the size shows how prevalent they
## are in the corpus, rather than how predictive they are (because we can
## already see that from the x-axis location).

wordCex2 <- colSums(dtm[,names(wordDiffs)])/sum(colSums(dtm[,names(wordDiffs)]))*200+.5
hist(wordCex2)


plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab="", axes=F,
  main="Which words predict Sports articles?",
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,"Word weights for predicting \"Sports\"\nLess Sports ............. More Sports", xpd=NA)
text(x=wordDiffs,y=yvals,labels=arabicWords,xpd=NA,col=wordTransparency,cex=wordCex2)


## Perhaps we want Arabic labels too.

plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab=reverse.transliterate("tkrr alklm0"), axes=F,
  main=reverse.transliterate("alklmat alty ttnba balmQalat alryaDy0"),
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,reverse.transliterate("mzyd mn alryaD0 ............. aQl mn alryaD0"), xpd=NA)
text(x=wordDiffs,y=yvals,labels=arabicWords,xpd=NA,col=wordTransparency,cex=wordCex2)


## if we want to save this, we'll use png format (because the R pdf creator doesn't play
## nice with Unicode).

png("code/fig1.png",400,400)
plot(x=wordDiffs,y=yvals,type="n",xlim=c(-.01,.025),
  xlab="",
  ylab=reverse.transliterate("tkrr alklm0"), axes=F,
  main=reverse.transliterate("alklmat alty ttnba balmQalat alryaDy0"),
  bty="n");axis(1)
abline(v=0,lty=3)
text(0,-.2,reverse.transliterate("mzyd mn alryaD0 ............. aQl mn alryaD0"), xpd=NA)
text(x=wordDiffs,y=yvals,labels=arabicWords,xpd=NA,col=wordTransparency,cex=wordCex2)
dev.off()

## End of exercises for "Visualization".  



## Begin Twitter tutorial

# https://www.r-bloggers.com/setting-up-the-twitter-r-package-for-text-analytics/
## Saved at aux_code/Setting up the Twitter R package for text analytics.pdf

## load libraries
library(twitteR)
library(longurl)

## SKIP DOWN TO WHERE I LOAD OBJECTS

## set up twitter authentication (following instructions from aux_code/Setting up the Twitter R package for text analytics.pdf)
consumer_key <- "YOURS HERE"
consumer_secret <- "YOURS HERE"
access_token <- "YOURS HERE"
access_secret <- "YOURS HERE"

## the set up command
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

## started exploring what I can get.

## The getUser() function pulls data on a specific user
tuser <- getUser("saaidnet1")
tuser$name
tuser$description
tuser$created
tuser$location

## I can get some of the timeline with the userTimeline() command
ut <- userTimeline('saaidnet1', n=10)

## This line (commented out) gets all of the tweets.
## I've saved the object from when I collected these tweets and it loads in the line below.

#ut <- userTimeline('saaidnet1', n=1000)

#saveRDS(ut,"tweets.rds")
ut <- readRDS("data/tweets.rds")

## Look at the first tweet
ut[[1]]

## Look at the names of the first tweet object
names(ut[[1]])

## There is a lot of information there.

## I don't use it in this tutorial, but in the past, I needed to expand
## the urls in tweets.  This is how.
expand_urls("https://t.co/RDtzWOIQb0")

## Process the tweets

## Load the stemmer library if we haven't already
library(arabicStemR)

## Make empty holders for all of the variables we will collect
stemmedTweetHolder <- rep(NA,length(ut))
retweetHolder <- rep(NA,length(ut))
favoriteHolder <- rep(NA,length(ut))
idHolder <- rep(NA,length(ut))
urlHolder <- c()

## Loop over the tweets, adding the information to each data holder.
for(i in 1:length(ut)){
  stemmedTweetHolder[i] <- stem(ut[[i]]$text)
  retweetHolder[i] <- ut[[i]]$retweetCount
  favoriteHolder[i] <- ut[[i]]$favoriteCount
  idHolder[i] <- ut[[i]]$id
  urlHolder[[i]] <- ut[[i]]$urls
}

## combine into meta data
dat <- data.frame(myid=1:length(idHolder),id=idHolder,text=stemmedTweetHolder,rt=retweetHolder,fv=favoriteHolder)

## Look at the meta data
head(dat)

## Topic model
library(stm)

source("aux_code/corrected_stm_functions.R")

## process for text analysi
processed <- textProcessor2(dat$text, lowercase = F, meta=dat,
                            removestopwords = F, removenumbers = F, removepunctuation = F,
                            stem = F, wordLengths = c(1, Inf), sparselevel = 1,
                            language = "na")

out <- prepDocuments(processed$documents, processed$vocab, processed$meta,lower.thresh=1)

## Use 10 topics for now.
K <- 10

## Estimate the topic model
set.seed(1234);  stm.out <- stm(documents = out$documents, vocab = out$vocab,
                                prevalence =~ rt + fv,
                                K = K,data = out$meta,init.type = "Spectral")

## Label the topics
topicLabels <- labelTopics(stm.out,n = 10)

topicLabels$frex


arabicTopicLabels <- apply(topicLabels$frex,1,function(y){
  paste(sapply(y,function(x){reverse.transliterate(x)}), collapse=";")
})

findThoughts(stm.out, texts = out$meta$url,n = 5, topics = 12)$docs[[1]]

plot(stm.out, type = "perspectives", topics = c(1, 3))


## Look at topic correlations
cormat <- topicCorr(stm.out)

## Then we plot it.  Note that the layout will change if we 
## don't set the seed (it rotates)

set.seed(12);plot(cormat,vertex.size = 200*colSums(stm.out$theta)/sum(colSums(stm.out$theta)))
#, vlabels=myTopicLabels, vertex.color = "gray92", vertex.label.cex=1.5,
#                  vertex.size = 200*colSums(stm.out$theta)/sum(colSums(stm.out$theta)))


prep <- estimateEffect(1:K ~ rt + fv, stm.out, meta = out$meta, uncertainty = "Global")

plot(prep)

dev.off() ## Shuts down the previous plot

## Plot which topics are correlated with the number of retweets
par(mar=c(5,5,2,2))
plot(prep, covariate = "rt",
     model = stm.out, method = "difference",
     cov.value1 = 1, cov.value2 = 0, 
     #labeltype = "custom",
     #custom.labels = myTopicLabels, 
     bty="n",
     main="Saaid.net Topics predict Retweets",
     xlab="Difference in predicted topic proportion")



## Can we build a predictive model of retweets and favorites?

## Check if retweets and favorites are related
summary(lm(rt~fv,dat))
## Yes, strongly correlated

## make a dtm
library(tm)

corp <- Corpus(VectorSource(dat$text))
dtm0 <- DocumentTermMatrix(corp, control = list(tolower=F, stemming = F, stopwords = F, minWordLength = 1,removeNumbers = F))

dim(dtm0)
# [1]  1107 32119

## Remove the infrequent terms

dtm <- removeSparseTerms(dtm0, 0.995)

## How many terms do we have left?

dim(dtm)
## [1] 971 277

## make it a data frame 

dtm <- data.frame(as.matrix(dtm))

## move the filepath over to be the rownames so we have a unique identifier
## for earch article.

rownames(dtm) <- dat$id


## Check if there are any documents with no words left (after 
## removing infrequent words)

table(rowSums(dtm)==0)

## There are 10. We need to remove it.

dtm <- dtm[-which(rowSums(dtm)==0),]

## Check the dimensions now

dim(dtm)
# [1] 961 277

## Look at the top again 

dtm[1:10,1:10]

rownames(dat) <- dat$id
metaData <- dat
dim(metaData)

## subset to just the ones we have text for
metaData <- metaData[rownames(dtm),]

dim(metaData)
# [1] 961   5

## combine retweets and favorites into a single outcome measure
dtm$rtfv <- metaData$fv+metaData$rt


## Now try making a predictive model
## this regresses retweets+favorites on all of the words individual
mod1 <- lm(rtfv ~ ., dtm)
summary(mod1)

## we can see which words have the largest coefficients 
sort(mod1$coeff)

## Make a predictive model with the topics as predictors

## get the topic proportions out of the stm model
names(stm.out)
topicProportions <- stm.out$theta

colnames(topicProportions) <- paste0("Topic",1:K)

rownames(topicProportions) <- out$meta$id

## combine this with the metaData object
metaData2 <- cbind(metaData,topicProportions[rownames(metaData),])

head(metaData2)
names(metaData2)
metaData2$rtfv <- metaData2$fv+metaData2$rt

## estimate retweets+favorites as a function of topics (leave one out as a baseline)
myFormula <- paste("rtfv ~",paste(paste0("Topic",2:K),collapse=" + "))
mod2 <- lm(myFormula, metaData2)
summary(mod2)


## END Tutorial





