## Illustrate data collection and analysis from Islamic sources online
## Rich Nielsen
## Prepared for "In their words | In our words" conference at ANU, 11/23/2017

## About this file:
## This file is a script containing computer code to collect and analyze data
## from the website saaid.net.

## You will notice that many lines begin with "##".  The # sign "comments out"
## a line of the script so that the computer will not try to execute that line.
## Lines that are commented out are comments for the users.  Lines that are not
## commented out are code for the computer.

## To run this script, you will need to install the free programming language R.
## 1) go to https://www.r-project.org/
## 2) Select "download R" which should lead you here: https://cran.r-project.org/mirrors.html
## 3) Select your preferred source for the code.  I prefer 0-cloud, 
##    which leads you here: https://cloud.r-project.org/
## 4) Select "Download R for [your operating system]"
## 5) Then click "install R for the first time" on the next page.
## 6) Then click "Download R [Version]"
## 7) Then follow operating-system-specific methods for installing new programs.  In Windows,
##    this means downloading the .exe file, then double-clicking it. I recommend installing
##    with all of the defaults.

## Open R.  
## You will want to install new add-ons as you go, so open it with permissions
## from your operating system to make changes.  In Windows, this means right-clicking on
## the R icon and selecting "Run as administrator."  Select yes in the dialogue
## box that opens, then wait for R to open.

## R orientation.
## You should see a window with a gray background, and a box within it that is labeled
## "R Console" at the top.  This is where the output appears.  When you open R, it will
## tell you what version of the program you are using and provide some other (mostly 
## extraneous) information.  At the bottom of the text, you should see a red ">".  This
## is where you can type in commands.

## If you have never programmed before, then welcome!  It is tradition that your
## first line of code in a new programming language be to print "hello world".  
## Put the cursor at the red ">" and type the following and press enter to execute it:
print("hello world")

## You should see the result:
# [1] "hello world"

## Typing commands into the interface is not practical.  Instead, use a script such
## as this one.  To open this script in R, click "File > Open Script" and then
## navigate to where this script is saved and select it.  It will open in R.

## You can execute part or all of this script now that you have opened it in R.
## Highlight the code below and then press the keys Ctrl+R (in Windows) or 
## Cmd+Enter (in Mac):

print("hello world")

## You should see the same result as before.

## I like to save information about R, so I use this command
## (it reports the R version, platform, locale and loaded packages):
sessionInfo()
## ...and then save the results in my script.

# R version 3.4.2 (2017-09-28)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 7 x64 (build 7601) Service Pack 1
# 
# Matrix products: default
# 
# locale:
# [1] LC_COLLATE=English_United States.1252 
# [2] LC_CTYPE=English_United States.1252   
# [3] LC_MONETARY=English_United States.1252
# [4] LC_NUMERIC=C                          
# [5] LC_TIME=English_United States.1252    
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# loaded via a namespace (and not attached):
# [1] compiler_3.4.2


## The R base code can be supplemented with add-on packages.
## We will install and use several of these.

##########
## Libraries and preliminaries
##########

## Note: install.packages commands commented out because they only need to 
##   be installed once.  Uncomment to install.

#install.packages("tm")
library(tm)
packageVersion("tm") #0.7.1
packageVersion("NLP") # 0.1.11
#install.packages("stm")
library(stm) # stm v1.3.0 (2017-09-08) successfully loaded. See ?stm for help.
library(Matrix)
packageVersion("Matrix") # 1.2.11
library(stringr) ## str_count()
packageVersion("stringr") # 1.2.0
#install.packages("arabicStemR")
library(arabicStemR)
packageVersion("arabicStemR") # 1.2
#install.packages("igraph")
library(igraph)
packageVersion("igraph") # 1.1.2

##########
## set the working directory
##########

## We will use a few saved files, so we need to tell R where to go
## looking for them.  We do this by setting the working directory.

## In my case, the files are here:

## mydir <- "C:/Users/Richard Nielsen/Desktop/Professional Stuff/ANU master class/MasterClassANU/"

## ... but that line is commented out because it only works on my computer.

## Fill in the complete file path to where you saved the files in
## "MasterClassANU.zip"

mydir <- "your/path/to/files/here"

## Check that the folder exists before going any further, so that a forgotten
## or mistyped path gives a readable message instead of a cryptic setwd() error.
if(!dir.exists(mydir)){
  stop("The folder '", mydir, "' does not exist. ",
       "Edit 'mydir' so that it points to the unzipped MasterClassANU files.",
       call. = FALSE)
}

## Then set the working directory.
setwd(mydir)



##########
## Collecting data by "scraping"
##########

## first, we will begin by collecting just a few documents to analyze.

## Let's start with English-language documents, so everyone can follow.
## We will extract the question and answer from this fatwa:
## http://en.islamway.net/fatwa/72380
fatwaUrl <- "http://en.islamway.net/fatwa/72380"
html <- readLines(con = fatwaUrl, encoding = "UTF-8")
## Printing the object we just created lets us look at the result:
print(html)

## "html" is a character vector with one element per line of the web page's
## html.  NOTE: the object could be called anything, so pick useful names.

## Next we use the html code to pull out the question and the answer.
## The easiest way to find where to cut is to "View page source" in your
## internet browser and look at the html code around the text you want.

## On this page the question is preceded by
## "<strong class="theme-color">Question</strong>:" and ends with "</div>"
## and the answer is preceded by
## <strong class="theme-color">Answer</strong>: and ends with </div>
## Those markers are what we cut at.

## Step 1: glue all of the lines together into one long string
html <- paste(html, collapse = " ")
print(html)
## Step 2: split at the Answer marker and keep the piece that follows it
answerPieces <- strsplit(html, "Answer</strong>:")[[1]]
tmp <- answerPieces[2]
print(tmp)
## Step 3: split again at the closing tag and keep the piece in front of it
tmp <- strsplit(tmp, "</div>")[[1]][1]
print(tmp)

## This is the chunk we want, but it still contains html tags.
## A regular expression replaces every <...> tag with a space:
tmp <- gsub("<.*?>", " ", tmp)
print(tmp)
## Step 4: drop the leading and trailing spaces
tmp <- str_trim(tmp)
print(tmp)  ## this is now the piece we wanted.

## The text still contains html entities such as &rsquo;, &#39; and &nbsp;.
## For now we swap each one for a plain character.  The names below are the
## regular expressions to look for; the values are their replacements.
entityMap <- c("&rsquo;|&#39;" = "'",
               "&rdquo;"       = "''",
               "&lsquo;"       = "`",
               "&ldquo;"       = "''",
               "&nbsp;"        = " ",
               "&ndash;"       = " ")
for(entity in names(entityMap)){
  tmp <- gsub(entity, entityMap[[entity]], tmp)
}
print(tmp)

## Now we do the same for the question, but...
## Any time we start doing repetitive things though, we might want to make
## a function to do them automatically.

## cleanHtml(): cut one piece of text out of a web page and tidy it up.
##
## Arguments:
##   input  - character vector of html (one element per line is fine; the
##            elements are pasted together with spaces before anything else)
##   split1 - the marker just BEFORE the text we want
##   split2 - the marker just AFTER the text we want
##            (both markers are handed to strsplit(), which reads them as
##             regular expressions)
## Value:
##   a single character string with tags removed, a handful of html entities
##   replaced, and the result passed through transliterate() and
##   removeDiacritics(); NA if nothing could be extracted (for example when
##   split1 does not occur in the page).
cleanHtml <- function(input, # some html from an English-language fatwa on islamway
                     split1, # the first piece of text at which to split
                     split2  # the second piece of text at which to split
                           ){
  tmp <- paste(input, collapse=" ")
  ## First, cut off everything before split1
  tmp <- strsplit(tmp,split1)[[1]][2]
  ## Then cut off everything after split 2
  tmp <- strsplit(tmp,split2)[[1]][1]
  ## If a marker was missing there is nothing to clean, so stop here rather
  ## than handing NA to the transliteration functions below
  if(is.na(tmp)){
    return(NA_character_)
  }
  ## remove html tags
  tmp <- gsub("<.*?>"," ", tmp)
  ## clean up weird characters: names are the patterns, values the replacements
  entities <- c("&rsquo;|&#39;" = "'",
                "&rdquo;"       = "''",
                "&lsquo;"       = "`",
                "&ldquo;"       = "''",
                "&laquo;"       = "''",
                "&raquo;"       = "''",
                "&nbsp;"        = " ",
                "&ndash;"       = " ",
                "&acirc;"       = "a",
                "&icirc;"       = "i",
                "&ucirc;"       = "u",
                "&zwnj;"        = "")
  for(pattern in names(entities)){
    tmp <- gsub(pattern, entities[[pattern]], tmp)
  }
  ## remove leading and trailing spaces -- done AFTER the entities are
  ## replaced so that a leading or trailing &nbsp; is trimmed away too
  tmp <- str_trim(tmp)
  ## remove diacritics and translate any Arabic 
  ## (we will do more aggressive things with Arabic later)
  tmp <- removeDiacritics(transliterate(tmp))
  ## return the result
  return(tmp)
}

## Test our function on the answer, to make sure it works there
cleanHtml(input = html, split1 = "Answer</strong>:", split2 = "</div>")

## Now, use our function to get the question
cleanHtml(input = html, split1 = "Question</strong>:", split2 = "</div>")

## ...and we can also get the number of views
cleanHtml(input = html, split1 = "Views Count:", split2 = "</div>")

## or the number of upvotes and downvotes
cleanHtml(input = html, split1 = "<span id=\"up-votes\">", split2 = "</span>")
cleanHtml(input = html, split1 = "<span id=\"down-votes\">", split2 = "</span>")


## We can use this process we've developed to get the questions and 
## answers for more fatwas:
## I'm taking the top fatwas from this list:
## http://en.islamway.net/fatawa/top?criterion=views

fatwaLinks <- c("63098","20007","20005","72383","72686",
                "72380","72708","20008","54053","63780")

## Notice that I just got the numbers.  That's because I notice that each  
## link has a consistent pattern, and only the number changes.

## I will use a loop to iterate over these fatwa links without having to
## go to each one manually. To illustrate how a loop works, here is a loop
## that simply prints each element of fatwaLinks in turn.
## (seq_along(x) counts 1, 2, ..., length(x) and, unlike 1:length(x),
##  does nothing at all when x is empty.)

for(i in seq_along(fatwaLinks)){
  print(fatwaLinks[i])
}   

## We want a loop that collects the items in each fatwa and puts them
## in a convenient format for us to analyze.

## First, we make an object that will hold the results.
## We create an object called a data frame, with a row for each document
## and a column for each piece of information.

## The pieces of information we will store, one column each
fatwaColumns <- c("question","answer","views","upvotes","downvotes")
## One row per link (sized from fatwaLinks so it still fits if the list changes)
fatwadat <- as.data.frame(matrix(NA,length(fatwaLinks),length(fatwaColumns)))
fatwadat
## Let's make the column names match the info
colnames(fatwadat) <- fatwaColumns
## Let's make the row names match the fatwa link ids
rownames(fatwadat) <- fatwaLinks
fatwadat

## Now, we loop over the links, get the html, and fill in the info.
## Cells are addressed as [row name, column name].

for(i in seq_along(fatwaLinks)){
  mylink <- fatwaLinks[i]
  print(mylink)
  ## get the link html (pasting the link number to the link base)
  html <- readLines(paste0("http://en.islamway.net/fatwa/",mylink), encoding="UTF-8")
  ## get the question and put it in the data frame
  fatwadat[mylink,"question"] <- cleanHtml(input = html, split1 = "Question</strong>:", split2 = "</div>")
  ## get the answer and put it in the data frame
  fatwadat[mylink,"answer"] <- cleanHtml(input = html, split1 = "Answer</strong>:", split2 = "</div>")
  ## get the views and put it in the data frame
  fatwadat[mylink,"views"] <- cleanHtml(input = html, split1 = "Views Count:", split2 = "</div>")
  ## the number of upvotes and downvotes
  fatwadat[mylink,"upvotes"] <- cleanHtml(input = html, split1 = "<span id=\"up-votes\">", split2 = "</span>")
  fatwadat[mylink,"downvotes"] <- cleanHtml(input = html, split1 = "<span id=\"down-votes\">", split2 = "</span>")
}

## We can inspect the whole data frame
fatwadat

## Or just a single element of it (indexed as [row, column])
fatwadat[1,1]

## Or pull out a row
fatwadat[1,]

## Or pull out a column
fatwadat[,1]

##########
## What makes a fatwa popular?
##########

## Now, we get to something a little more analytical.  
## Let's see what topics get more views.

## first, we collect ALL of the english fatwas

## We can get the fatwa links on a page using string manipulations
## First, get the html
html <- readLines("http://en.islamway.net/fatawa/new?page=1")
## paste the html into a single line
html <- paste(html,collapse=" ")
## get the fatwa links: every piece after the first starts with a link number
tmp <- strsplit(html,"href=\"/fatwa/")[[1]]
## Look at the intermediate result
tmp
## get rid of the first element (the part of the page before the first link)
tmp <- tmp[-1]
## look at the intermediate result
tmp
## Then, for each piece, keep only what comes before the first "/" -- that is
## the link number.  strsplit() splits every piece at once and vapply() picks
## the first part of each; this also works when a page has no links at all.
tmp <- vapply(strsplit(tmp,"/"), function(x){x[1]}, character(1))
## look at the result
tmp

## But we want to do this for each of the 13 pages of English-language fatwas
## so we need an outer loop over each page

linkList <- c()  ## an empty object to hold the links

## The number of pages of English-language fatwas to visit
nPages <- 13

## Loop over the pages (does the same as above for each page)
for(j in seq_len(nPages)){
  print(j)
  ## get the html
  html <- readLines(paste0("http://en.islamway.net/fatawa/new?page=",j))
  ## paste the html into one line
  html <- paste(html,collapse=" ")
  ## get the fatwa links, dropping the piece before the first link
  tmp <- strsplit(html,"href=\"/fatwa/")[[1]][-1]
  ## keep just the number at the start of each piece
  ## (safe even if a page turns out to contain no links)
  tmp <- vapply(strsplit(tmp,"/"), function(x){x[1]}, character(1))
  ## add the result to the linkList object
  linkList <- c(linkList, tmp)
}

## look at the result
linkList
## how long is the result?
length(linkList)

## Get rid of any duplicates
linkList <- unique(linkList)
length(linkList) #252

## Now, we can do our loop from before, but over all 252 of the english language fatwas
fatwaLinks <- linkList

## It's generally a good idea to get all of the html and save it.  You never know what
## you'll want later

## Downloading every page takes a long time, so the download only runs when
## the saved copy "htmlholder.rds" is NOT already in the working directory.
## (The saved copy ships with the course files, so normally this is skipped.)

if(!file.exists("htmlholder.rds")){
  ## An empty holder for the html: a list with one slot per link
  htmlholder <- vector("list", length(fatwaLinks))
  for(i in seq_along(fatwaLinks)){
    mylink <- fatwaLinks[i]
    print(paste("link",i,":",mylink))
    ## get the link html (pasting the link number to the link base)
    htmlholder[[i]] <- readLines(paste0("http://en.islamway.net/fatwa/",mylink), encoding="UTF-8")
  }
  ## add the link names to the htmlholder object
  names(htmlholder) <- fatwaLinks
  ## This line saves the object
  saveRDS(htmlholder,"htmlholder.rds")
}

## This line loads the object from where it is saved
htmlholder <- readRDS("htmlholder.rds")
## Use the link ids stored with the saved pages from here on, so that
## everything lines up even if the website has changed since they were saved
fatwaLinks <- names(htmlholder)

## Now, we loop over the html, get the html, and fill in the info
## The pieces of information we will store, one column each
fatwaColumns <- c("question","answer","views","upvotes","downvotes","date")
fatwadat <- as.data.frame(matrix(NA,length(fatwaLinks),length(fatwaColumns)))
## look at the result
fatwadat
## Let's make the column names match the info
colnames(fatwadat) <- fatwaColumns
## Let's make the row names match the fatwa link ids
rownames(fatwadat) <- fatwaLinks
## look at the result
fatwadat

## Loop over each element of htmlholder, extract the data we want,
## and place it in fatwadat.  Cells are addressed as [row name, column name].

for(i in seq_along(htmlholder)){
  html <- htmlholder[[i]]
  mylink <- names(htmlholder)[i]
  ## get the question and put it in the data frame
  fatwadat[mylink,"question"] <- cleanHtml(input = html, split1 = "Question</strong>:", split2 = "</div>")
  ## get the answer and put it in the data frame
  fatwadat[mylink,"answer"] <- cleanHtml(input = html, split1 = "Answer</strong>:", split2 = "</div>")
  ## get the views and put it in the data frame
  fatwadat[mylink,"views"] <- cleanHtml(input = html, split1 = "Views Count:", split2 = "</div>")
  ## the number of upvotes and downvotes
  fatwadat[mylink,"upvotes"] <- cleanHtml(input = html, split1 = "<span id=\"up-votes\">", split2 = "</span>")
  fatwadat[mylink,"downvotes"] <- cleanHtml(input = html, split1 = "<span id=\"down-votes\">", split2 = "</span>")
  ## add the date (the text between 'time datetime="' and the next space)
  fatwadat[mylink,"date"] <- cleanHtml(input = html, split1 = "time datetime=\"", split2 = " ")
}

## We can inspect the whole data frame
fatwadat

## Now we can look at the page views for each fatwa
## first, we need to get rid of the comma and make the strings into numbers
fatwadat$views <- as.numeric(gsub(",","",fatwadat$views))
## We can look at summary statistics
summary(fatwadat$views)
## We can look at a histogram of the number of page views
hist(fatwadat$views)

##########
## Make a Document-Term Matrix
##########

## First, we combine the questions and the answers by pasting them together.
fatwatext <- paste(fatwadat[,1],fatwadat[,2])
## look at the result
fatwatext[1]

## Then, let's replace a punctuation mark that becomes a problem
## in the common word "Qur'aan": '
fatwatext <- gsub("'","",fatwatext)

## make a corpus that combines the Question and Answer into a single document
corp <- Corpus(VectorSource(fatwatext))
## Look at the result
corp
## Look at the text for a single element
corp[[1]]$content

## make a document-term matrix
## NOTE: the minimum word length is set with wordLengths = c(min, max).
## The older option name "minWordLength" is not recognized by current
## versions of tm, so the default minimum of 3 characters would apply.
dtm <- DocumentTermMatrix(corp, 
         control = list(tolower=TRUE, stemming = TRUE, stopwords = TRUE, wordLengths = c(2, Inf),
         removeNumbers = TRUE, removePunctuation = TRUE))

## check the dimensions of the dtm
dim(dtm)

## make the text data substantially less sparse
dtm <- removeSparseTerms(dtm, 0.98)
## check the new dimensions
dim(dtm)

## make it a data frame
dtm <- as.data.frame(as.matrix(dtm))

## add the document names as rownames
rownames(dtm) <- rownames(fatwadat)

## Look at the most frequent terms (column totals, smallest to largest)
sort(colSums(dtm))

## Look at how the dtm represents a document: fatwa on reading the quran on your phone
dtm["72380",]

## now, we sort to see what the most common words are
## (unlist() turns the one-row data frame into a named vector that sort() handles)
sort(unlist(dtm["72380",]))

## make sure all the docs have words left (remember, we got rid of rare words)
doclen <- rowSums(dtm)
min(doclen) ## should be more than 0.  If not, there will be problems later.

##########
## Structural Topic Model: What topics correlate with fatwa popularity?
##########

## make a variable indicating the popular documents

## figure out the rate of page views for each document
## 1) First, figure out how many days since publication
collectionDate <- as.Date("2017-11-05")
publicationDate <- as.Date(fatwadat[["date"]])
fatwadat[["daysSincePublication"]] <- as.numeric(collectionDate - publicationDate)
## 2) Then divide the page views by the number of days to get views per day
fatwadat[["viewRate"]] <- fatwadat[["views"]] / fatwadat[["daysSincePublication"]]
## A histogram shows how the view rate is distributed
hist(fatwadat$viewRate)
## A scatter plot compares the raw view count with the view rate
plot(fatwadat$views,fatwadat$viewRate)

## The cutoff that separates the top 10% of documents from the rest
popularCutoff <- quantile(fatwadat[["viewRate"]], probs = 0.90)
print(popularCutoff)

## "popular" is 1 for documents above the cutoff and 0 otherwise
fatwadat[["popular"]] <- as.numeric(fatwadat[["viewRate"]] > popularCutoff)
table(fatwadat$popular) ## Not quite top 25, but close enough

## We make an object called "treatvec" that we'll add to a "meta-data" object for 
## the structural topic model.
treatvec <- fatwadat$popular 
## make the meta-data object: a data frame with the single column "treatvec"
meta2 <- as.data.frame(treatvec)
## the stm() function wants the DTM in a different format, so the Matrix() command helps:
dtmM <- Matrix(as.matrix(dtm))
## This preprocesses the text for the stm() function
processed <- readCorpus(dtmM, type="dtm")
## This prepares everything for the stm() function.  The result "out" holds
## the documents, vocab and meta pieces used below.
out <- prepDocuments(processed$documents, processed$vocab, meta2)
## We have to pick a number of topics to estimate.
## I'm arbitrarily picking 15.  We will get different results with different
## numbers of topics
K <- 15
## Estimate the stm model (while timing it).  The seed is set immediately
## before fitting so that re-running this block starts from the same random state.
system.time({
set.seed(1234);stm.out <- stm(out$documents, out$vocab, K=K, prevalence=~treatvec,data=out$meta)
})

## look at the 5 words most associated with each topic
lab <- labelTopics(stm.out,n=5)
## look at the result
lab
## get the FREX words and make them into a label:
## each row of lab$frex is pasted into one comma-separated string.
## NOTE: this overwrites "lab" -- from here on it is a character vector of labels.
lab <- apply(lab$frex,1,function(x){paste(x,collapse=", ")})
lab
## Estimate the "effects" of popularity on topic proportions (causation runs the other way)
prep <- estimateEffect(1:K ~ treatvec, stm.out, out$meta)
## Plot the resulting "effects"
## First, plot without good labels and save the result.  The difference shown
## is treatvec = 1 (popular) versus treatvec = 0 (the rest).
## "tmp" is reused further down (tmp$means) to order and color the topics.
tmp <-plot.estimateEffect(prep, "treatvec", model=stm.out, method="difference",cov.value1=1,cov.value2=0)
## Then get the order of the effect sizes (largest first)
topicOrder <- rev(order(unlist(tmp$means)))
## Then make a slightly prettier plot
## (the second number in mar is the left margin, widened here for the long labels)
par(mar=c(5,15,1,1))
plot.estimateEffect(prep, "treatvec", model=stm.out, topics=topicOrder, method="difference",cov.value1=1,cov.value2=0,
                          labeltype="custom",custom.labels=lab[topicOrder],width=100)

## Make a network plot of how the topics co-occur
## I calculate the proportion of words allocated to each topic
## using the word counts and the "theta" estimate from the topic model
## Total number of words (after pruning) in each document
wordcounts <- rowSums(dtm)
## there are fractional wordcounts due to variational approximation.
round(stm.out$theta[,1] * wordcounts,2)

## For every topic: the words allocated to it across all documents, divided
## by the total number of words in the corpus
topicPropsInCorpus <- vapply(seq_len(K), function(k){
  (sum(stm.out$theta[,k] * wordcounts))/sum(wordcounts)
}, numeric(1))
## This now holds the topic proportions in the corpus
topicPropsInCorpus
## sums to one, as it should
sum(topicPropsInCorpus)
## add the topic labels
names(topicPropsInCorpus) <- lab
## Colors that indicate the correlation with popularity: start with every
## topic in faint gray, then mark clearly negative estimates in blue and
## clearly positive estimates in red
topicMeans <- unlist(tmp$means)
mycols <- rep("#00000010",K)
mycols[topicMeans < -.05] <- "#0000FF50"
mycols[topicMeans > .05] <- "#FF000070"

## shut down the plotting device
dev.off() 
## Plot the network (seed set first so the layout is the same every time)
set.seed(123)
plot(topicCorr(stm.out), vlabels=lab, vertex.color=mycols, vertex.size=topicPropsInCorpus*200)


##########
## Re-do everything with Arabic text
##########

##########
## Arabic: Collecting data by "scraping"
##########

## We will extract the question and answer from this fatwa:
## https://ar.islamway.net/fatwa/13964
## Download the page and glue its lines into one single string in one step
html <- paste(readLines("https://ar.islamway.net/fatwa/13964", encoding="UTF-8"), collapse=" ")
## Look at the result
print(html)

## As before, we want to extract things, but now the task is trickier
## because R doesn't play as nicely with Arabic.
## Luckily, the question is offset with "<div class="question html">" and </div>
## and the answer is offset with "<div class="answer html">" and "</div>"

## Here's a new version of the "cleanHtml" function that doesn't
## man-handle the Arabic

## cleanHtml(), Arabic version: cut one piece of text out of a web page and
## tidy it up WITHOUT transliterating it.
##
## Arguments:
##   input  - character vector of html (one element per line is fine; the
##            elements are pasted together with spaces before anything else)
##   split1 - the marker just BEFORE the text we want
##   split2 - the marker just AFTER the text we want
##            (both markers are handed to strsplit(), which reads them as
##             regular expressions)
## Value:
##   a single character string with tags removed and a handful of html
##   entities replaced; NA if nothing could be extracted (for example when
##   split1 does not occur in the page).
cleanHtml <- function(input, # some html from an Arabic-language fatwa on islamway
                     split1, # the first piece of text at which to split
                     split2  # the second piece of text at which to split
                           ){
  tmp <- paste(input, collapse=" ")
  ## First, cut off everything before split1
  tmp <- strsplit(tmp,split1)[[1]][2]
  ## Then cut off everything after split 2
  tmp <- strsplit(tmp,split2)[[1]][1]
  ## If a marker was missing there is nothing to clean
  if(is.na(tmp)){
    return(NA_character_)
  }
  ## remove html tags
  tmp <- gsub("<.*?>"," ", tmp)
  ## clean up weird characters: names are the patterns, values the replacements
  entities <- c("&rsquo;|&#39;" = "'",
                "&rdquo;"       = "''",
                "&lsquo;"       = "`",
                "&ldquo;"       = "''",
                "&laquo;"       = "''",
                "&raquo;"       = "''",
                "&nbsp;"        = " ",
                "&ndash;"       = " ",
                "&acirc;"       = "a",
                "&icirc;"       = "i",
                "&ucirc;"       = "u",
                "&zwnj;"        = "")
  for(pattern in names(entities)){
    tmp <- gsub(pattern, entities[[pattern]], tmp)
  }
  ## remove leading and trailing spaces -- done AFTER the entities are
  ## replaced so that a leading or trailing &nbsp; is trimmed away too
  tmp <- str_trim(tmp)
  ## return the result
  return(tmp)
}

## Extract the question once and keep it, so we can reuse it below
arabicQuestion <- cleanHtml(input = html, split1 = "<div class=\"question html\">", split2 = "</div>")
arabicQuestion
## Beautiful!  But getting all our software to work with Arabic
## unicode is a real headache.  We are going to use the arabicStemR
## package (that I wrote) to stem the text, and to transliterate it into
## roman characters.

## Stemming and stopword removal, keeping the Arabic script:
stem(arabicQuestion, transliteration=FALSE)

## ...and the same thing with transliteration (the default):
stem(arabicQuestion)


## Collect a set of Arabic-language fatwas

## This is the page on islamway with the wildly popular fatwas
# https://ar.islamway.net/fatawa?view=visits

## I get the first four pages of most popular fatwas
topFatwaLinks <- c("https://ar.islamway.net/fatawa?view=visits",
                   "https://ar.islamway.net/fatawa?view=visits&json=1&page=2&lid=127402&lrank=577480",
                   "https://ar.islamway.net/fatawa?view=visits&json=1&page=3&lid=179551&lrank=447565",
                   "https://ar.islamway.net/fatawa?view=visits&json=1&page=4&lid=117647&lrank=393566")

linkList <- c()  ## an empty object to hold the links

## Loop over each of the "most popular" pages
for(pageUrl in topFatwaLinks){
  html <- readLines(pageUrl)
  html <- paste(html,collapse=" ")
  ## get the fatwa links, dropping the piece before the first link
  tmp <- strsplit(html,"href=\"/fatwa/")[[1]][-1]
  ## keep just the number at the start of each piece.  Doing this for all
  ## pieces at once avoids a second loop that would reuse the page loop's
  ## counter, and it also works if a page contains no links.
  tmp <- vapply(strsplit(tmp,"/"), function(x){x[1]}, character(1))
  ## remove duplicates
  tmp <- unique(tmp)
  ## add the result to the holder
  linkList <- c(linkList, tmp)
}

## Look at the length of the result
length(linkList)

## The "new fatwas" list is programmed differently, so we can't get it
## as easily.  But since most fatwas on islamway are in Arabic, we can just
## randomly sample, and many numbers will work.

## N is the size of the sample I'll try
N <- 400
## This line randomly samples 400 numbers to try
set.seed(1234);mysample <- sample(1:75949, N, replace=FALSE)
## I place these 400 in the "linkList" object
linkList <- unique(c(linkList, mysample))

## Now, we can do our loop from before, getting the html
fatwaLinks <- linkList

## Downloading takes a while, so the download only runs when the saved copy
## "htmlholderArabic.rds" is NOT already in the working directory.
## (The saved copy ships with the course files, so normally this is skipped.)

if(!file.exists("htmlholderArabic.rds")){
  ## An empty holder for the html: a list with one slot per link
  htmlholder <- vector("list", length(fatwaLinks))
  for(i in seq_along(fatwaLinks)){
    mylink <- fatwaLinks[i]
    print(paste("link",i,":",mylink))
    ## get the link html (pasting the link number to the link base).
    ## Many sampled numbers are not real fatwas; when the page cannot be
    ## read we store the placeholder "nothing" and move on.
    htmlholder[[i]] <- tryCatch(
      readLines(paste0("http://ar.islamway.net/fatwa/",mylink), encoding="UTF-8"),
      error = function(e){ "nothing" })
  }
  ## name the items in the holder with the link names
  names(htmlholder) <- fatwaLinks

  ## get rid of the sampled numbers that didn't correspond to a fatwa
  ## See how many there are before
  print(length(htmlholder))
  ## Remove those that have "nothing".  We keep the pages that are NOT marked,
  ## which also does the right thing when no page is marked at all
  ## (x[-which(...)] would throw away everything in that case).
  isMissing <- vapply(htmlholder, function(x){ identical(x[1], "nothing") }, logical(1))
  htmlholder <- htmlholder[!isMissing]
  ## See how many are left
  print(length(htmlholder))

  ## I save the object here
  saveRDS(htmlholder,"htmlholderArabic.rds")
}

## Load the saved object
htmlholder <- readRDS("htmlholderArabic.rds")
## replace this object so it all lines up with the pages we actually have
fatwaLinks <- names(htmlholder)

## Now, we loop over the html, get the html, and fill in the info
## The pieces of information we will store, one column each
fatwaColumns <- c("question","answer","views","upvotes","downvotes","date")
fatwadat <- as.data.frame(matrix(NA,length(fatwaLinks),length(fatwaColumns)))
fatwadat
## Let's make the column names match the info
colnames(fatwadat) <- fatwaColumns
## Let's make the row names match the fatwa link ids
rownames(fatwadat) <- fatwaLinks
fatwadat

## Loop over each saved page, extract the data we want, and place it in
## fatwadat.  Cells are addressed as [row name, column name].
for(i in seq_along(htmlholder)){
  html <- htmlholder[[i]]
  mylink <- names(htmlholder)[i]
  ## get the question, stem it, and put it in the data frame
  fatwadat[mylink,"question"] <- stem(cleanHtml(input = html, split1 = "<div class=\"question html\">", split2 = "</div>"))
  ## get the answer, stem it, and put it in the data frame
  fatwadat[mylink,"answer"] <- stem(cleanHtml(input = html, split1 = "<div class=\"answer html\">", split2 = "</div>"))
  ## get the views and put it in the data frame
  fatwadat[mylink,"views"] <- cleanHtml(input = html, split1 = "<span class=\"views-count\">", split2 = "</span>")
  ## the number of upvotes and downvotes
  fatwadat[mylink,"upvotes"] <- cleanHtml(input = html, split1 = "<span class=\"up-votes\">", split2 = "</span>")
  fatwadat[mylink,"downvotes"] <- cleanHtml(input = html, split1 = "<span class=\"down-votes\">", split2 = "</span>")
  ## add the date
  fatwadat[mylink,"date"] <- cleanHtml(input = html, split1 = "<span class=\"darker\">", split2 = "</div>")
}

## We can inspect the whole data frame
fatwadat

## Now we can look at the views
## (remove the commas, then turn the strings into numbers)
fatwadat$views <- as.numeric(gsub(",","",fatwadat$views))
## Look at summary statistics of the views
summary(fatwadat$views)
## Look at a histogram of the views
hist(fatwadat$views, breaks=30)

##########
## Arabic: Make a Document-Term Matrix
##########

## First, we combine the questions and the answers by pasting them together.
fatwatext <- paste(fatwadat[,1],fatwadat[,2])
## look at the result
fatwatext[1]

## Then, let's replace a punctuation mark that becomes a problem
## in the common word "Qur'aan": '
fatwatext <- gsub("'","",fatwatext)

## make a corpus that combines the Question and Answer into a single document
corp <- Corpus(VectorSource(fatwatext))
## Look at the result
corp
## Look at the text for a single element
corp[[1]]$content

## make a document-term matrix
## Note -- now we are not cleaning up the text at all in this step
## (the stemming was already done by stem() when we filled in fatwadat).
## wordLengths = c(1, Inf) keeps words of every length.  The older option
## name "minWordLength" is not recognized by current versions of tm, so the
## default minimum of 3 characters would silently drop short stems.
dtm <- DocumentTermMatrix(corp, 
         control = list(tolower=FALSE, stemming = FALSE, stopwords = FALSE, wordLengths = c(1, Inf),
         removeNumbers = FALSE, removePunctuation = FALSE))

## check the dimensions of the dtm
dim(dtm)

## make the text data substantially less sparse
dtm <- removeSparseTerms(dtm, 0.99)
## check the new dimensions
dim(dtm)

## make it a data frame
dtm <- as.data.frame(as.matrix(dtm))

## add the document names as rownames
rownames(dtm) <- rownames(fatwadat)

## Look at the most frequent terms (column totals, smallest to largest)
sort(colSums(dtm))

## make sure all the docs have words left (remember, we got rid of rare words)
doclen <- rowSums(dtm)
min(doclen) ## should be more than 0.  If not, there will be problems later.

##########
## Arabic: Structural Topic Model: What topics correlate with fatwa popularity?
##########

## make a variable indicating the popular documents

## figure out the rate of page views for each document
## 1) First, figure out how many days since publication
collectionDate <- as.Date("2017-11-05")
publicationDate <- as.Date(fatwadat[["date"]])
fatwadat[["daysSincePublication"]] <- as.numeric(collectionDate - publicationDate)
## 2) Then divide the page views by the number of days to get views per day
fatwadat[["viewRate"]] <- fatwadat[["views"]] / fatwadat[["daysSincePublication"]]
## A histogram shows how the view rate is distributed
hist(fatwadat$viewRate)
## A scatter plot compares the raw view count with the view rate
plot(fatwadat$views,fatwadat$viewRate)

## "popular" is 1 for documents with more than this many views and 0 otherwise
viewThreshold <- 100000
fatwadat[["popular"]] <- as.numeric(fatwadat[["views"]] > viewThreshold)
table(fatwadat$popular) 
#   0   1 
# 196  49

## We make an object called "treatvec" that we'll add to a "meta-data" object for 
## the structural topic model.
treatvec <- fatwadat$popular 
## make the meta-data object: a data frame with the single column "treatvec"
meta2 <- as.data.frame(treatvec)
## the stm() function wants the DTM in a different format, so the Matrix() command helps:
dtmM <- Matrix(as.matrix(dtm))
## This preprocesses the text for the stm() function
processed <- readCorpus(dtmM, type="dtm")
## This prepares everything for the stm() function.  The result "out" holds
## the documents, vocab and meta pieces used below.
out <- prepDocuments(processed$documents, processed$vocab, meta2)
## We have to pick a number of topics to estimate.
## I'm arbitrarily picking 15.  We will get different results with different
## numbers of topics
K <- 15
## Estimate the stm model (while timing it).  The seed is set immediately
## before fitting so that re-running this block starts from the same random state.
system.time({
set.seed(1234);stm.out <- stm(out$documents, out$vocab, K=K, prevalence=~treatvec,data=out$meta)
})

## look at the 5 words most associated with each topic
lab <- labelTopics(stm.out,n=5)
## look at the result
lab
## get the FREX words and make them into a label:
## each row of lab$frex is pasted into one comma-separated string.
## NOTE: this overwrites "lab" -- from here on it is a character vector of labels.
lab <- apply(lab$frex,1,function(x){paste(x,collapse=", ")})
lab
## Make the labels into Arabic again
## ("lab" keeps the transliterated labels; "labArabic" holds the converted ones)
labArabic <- sapply(lab, reverse.transliterate)
labArabic
## Estimate the "effects" of popularity on topic proportions (causation runs the other way)
prep <- estimateEffect(1:K ~ treatvec, stm.out, out$meta)
## Plot the resulting "effects"
## First, plot without good labels and save the result.  The difference shown
## is treatvec = 1 (popular) versus treatvec = 0 (the rest).
## "tmp" is reused further down (tmp$means) to order and color the topics.
tmp <-plot.estimateEffect(prep, "treatvec", model=stm.out, method="difference",cov.value1=1,cov.value2=0)
## Then get the order of the effect sizes (largest first)
topicOrder <- rev(order(unlist(tmp$means)))
## Then make a slightly prettier plot
## (the second number in mar is the left margin, widened here for the long labels)
par(mar=c(5,15,1,1))
plot.estimateEffect(prep, "treatvec", model=stm.out, topics=topicOrder, method="difference",cov.value1=1,cov.value2=0,
                          labeltype="custom",custom.labels=labArabic[topicOrder],width=100)

## Note: R for Mac does not seem to be able to correctly plot Arabic:
## https://stackoverflow.com/questions/22423760/right-to-left-languages-support-in-r-using-mac
## A real shame...and a reason I use Windows for this.

## Here's the plot in my transliterated Arabic
plot.estimateEffect(prep, "treatvec", model=stm.out, topics=topicOrder, method="difference",cov.value1=1,cov.value2=0,
                          labeltype="custom",custom.labels=lab[topicOrder],width=100)


## Make a network plot of how the topics co-occur
## I calculate the proportion of words allocated to each topic
## using the word counts and the "theta" estimate from the topic model
## Total number of words (after pruning) in each document
wordcounts <- rowSums(dtm)
## there are fractional wordcounts due to variational approximation.
round(stm.out$theta[,1] * wordcounts,2)

## For every topic: the words allocated to it across all documents, divided
## by the total number of words in the corpus
topicPropsInCorpus <- vapply(seq_len(K), function(k){
  (sum(stm.out$theta[,k] * wordcounts))/sum(wordcounts)
}, numeric(1))
## This now holds the topic proportions in the corpus
topicPropsInCorpus
## sums to one, as it should
sum(topicPropsInCorpus)
## add the topic labels
names(topicPropsInCorpus) <- lab
## Colors that indicate the correlation with popularity: start with every
## topic in faint gray, then mark clearly negative estimates in blue and
## clearly positive estimates in red
topicMeans <- unlist(tmp$means)
mycols <- rep("#00000010",K)
mycols[topicMeans < -.05] <- "#0000FF25"
mycols[topicMeans > .05] <- "#FF000025"

## shut down the plotting device
dev.off() 
## Plot the network with Arabic labels
## (seed set first so the layout is the same every time)
set.seed(123)
plot(topicCorr(stm.out), vlabels=labArabic, vertex.color=mycols, vertex.size=topicPropsInCorpus*200)

## Plot the network with transliterated Arabic
set.seed(123)
plot(topicCorr(stm.out), vlabels=lab, vertex.color=mycols, vertex.size=topicPropsInCorpus*200)


##########
## End
##########
