
## This file relies on the filepaths defined in arabicTextWorkshop.R

## Folder (relative to the working directory) where all scraped pages are saved.
## Keep the trailing slash: the scraping loop below pastes each date directly
## onto the end of this string to build the per-day folder name.

dataDirectory <- "ahramArticles/"

## Create the folder.  showWarnings = FALSE keeps dir.create() from warning
## that the folder "already exists" when the script is re-run to resume an
## earlier collection.

dir.create(dataDirectory, showWarnings = FALSE)

##########
## Collect two weeks' worth of articles from the Al-Ahram online archive and save the text
##########

## To collect multiple articles, we need to navigate the hyperlink
## structure of the Al-Ahram archive.

## The Al-Ahram archive is organized by date, so the first step is to decide
## which dates to collect.  seq() builds a run of 14 consecutive days
## beginning on 2009/12/27.

myDates <- seq(from = as.Date("2009/12/27"), length.out = 14, by = "day")

## Print the dates as a quick check that the sequence came out as intended.

myDates


## We collect the articles for each day in a set of nested for loops.  Each
## loop must have its own counter.  I use k in the outer loop (dates), i in the
## middle loop (sections of the newspaper), and j in the inner loop (articles).
## I indent each loop by two spaces so the code is easier to read.
##
## For each date, the files are saved in a folder named <dataDirectory><date>:
##   index.txt                           the index page for that day
##   subindex_<section>.txt              one page per section of the newspaper
##   article_<section>_<article>.txt     one page per article
## The first line of every saved file is the url the html was downloaded from.


## Helper: download one page and return its html as a character vector with one
## element per line.  If the download fails, print the reason and return NULL
## so that a single bad page does not stop the whole multi-day collection.

fetchHtml <- function(pageUrl){
  tryCatch(
    readLines(pageUrl, encoding="UTF-8"),
    error = function(e){
      message("could not read ", pageUrl, ": ", conditionMessage(e))
      NULL
    }
  )
}

## Helper: return the lines of html from the first line that matches startPattern
## through the first line that matches endPattern.  Returns NULL if either
## pattern is not found anywhere in the html.

linesBetween <- function(html, startPattern, endPattern){
  ## grep() returns every matching line number; [1] keeps the first one and
  ## gives NA when there is no match at all.
  firstLine <- grep(startPattern, html)[1]
  lastLine <- grep(endPattern, html)[1]
  if(is.na(firstLine) || is.na(lastLine)){return(NULL)}
  html[firstLine:lastLine]
}

## Helper: save a page to disk.  We combine the url and the html (so that we can
## remember where we got the html from), change the encoding to "bytes", and
## write the lines to the file at path.

savePage <- function(pageUrl, html, path){
  saveHtml <- c(pageUrl, html)
  Encoding(saveHtml) <- "bytes"
  writeLines(saveHtml, path)
}


## Start the loop over each one of the dates.  seq_along() is used rather than
## 1:length() because 1:length(x) counts 1, 0 when x is empty.
for(k in seq_along(myDates)){

  ## make a string that indicates where we want to save each file of the data

  saveLocation <- paste0(dataDirectory,myDates[k])

  ## If the directory has already been created, skip to the next iteration of the loop.
  ## Note: a directory left behind by an interrupted run is skipped too; delete it
  ## if you want that date to be collected again.

  if(dir.exists(saveLocation)){next}

  ## From our exploration of the structure of the archive, we see that we can put the date
  ## into the URL to navigate to a specific day.  The first page we navigate to is the
  ## index page which has links to all of the subcategories of articles in the newspaper
  ## that day.

  ## Use the gsub() command to put the k-th element of myDates into the format
  ## of the URL.

  dateString <- gsub("-","/",myDates[k])

  ## paste the strings together to make the URL

  url <- paste0("http://www.ahram.org.eg/archive/news/", dateString, "/index.aspx")

  ## collect the html from the url

  rawHtml <- fetchHtml(url)

  ## If the index page could not be downloaded, skip this date.  The directory has
  ## not been created yet, so running the script again will retry this date.

  if(is.null(rawHtml)){next}

  ## Create the directory where we will save the data

  dir.create(saveLocation)

  ## We save the index file as "index.txt" (there is only one per date).

  savePage(url, rawHtml, paste0(saveLocation, "/index.txt"))

  ## Our next task is to navigate to the sub-index pages for each section of the
  ## newspaper in the sidebar.  From our exploration of the structure, we see that
  ## we can pull out the sidebar if we get all of the html between the strings
  ## "RightMenu_MainCats" and "RightMenu_NewMainMenu".  I'll call this object links.

  links <- linesBetween(rawHtml, "RightMenu_MainCats", "RightMenu_NewMainMenu")

  ## If the sidebar is not on the page, there is nothing more to collect for this date.

  if(is.null(links)){
    message("no section sidebar found on ", url)
    next
  }

  ## We use the grep() function to get only the lines that contain "a href=.*?index.aspx'"
  ## This is a regular expression that matches strings like "a href= xxxxxxxx index.aspx"

  links <- links[grep("a href=.*?index.aspx'",links)]

  ## we can use regex to remove the html code around the links

  links <- gsub("^.*?href='|'>.*?$","",links)

  ## We will navigate to each link and save the html code, like we did for the index.
  ## Begin the middle loop over each subsection.  If no links were found, seq_along()
  ## makes this loop run zero times.

  for(i in seq_along(links)){

    ## make a url object that is the i-th element of links

    url <- links[i]

    ## get the html, skipping this section if the page could not be downloaded

    rawHtml <- fetchHtml(url)
    if(is.null(rawHtml)){next}

    ## Get the number associated with each section of the newspaper by using
    ## regex to parse the url: remove everything up to and including the date,
    ## then remove the trailing "/index.aspx".

    indexNumber <- gsub("/index.aspx","",gsub(paste0("^.*",dateString,"/"),"",url))

    ## Then save the url and the html as before.  This time we name the file
    ## "subindex_" and paste in the indexNumber from the previous line.

    savePage(url, rawHtml, paste0(saveLocation, "/subindex_",indexNumber,".txt"))

    ## From our navigation of the structure, we see that we can finally see a list
    ## of all articles in this section on this day.  Our goal is to get the article
    ## links and collect each of the articles.

    ## We see that in the html, the article links are between "OuterNews" and "Facebook Like Box".

    articleLinks <- linesBetween(rawHtml, "OuterNews", "Facebook Like Box")

    ## If the article list is not on the page, skip to the next section.

    if(is.null(articleLinks)){
      message("no article list found on ", url)
      next
    }

    ## We use a similar grep() regex expression as before to get the article links,
    ## and paste the matching lines into one long string.

    articleLinks <- paste(articleLinks[grep("href=.*?.aspx'",articleLinks)],collapse=" ")

    ## We need to split up the links because they come in a big paragraph.  We omit the
    ## first element because it is not a link (it is the stuff before a link).

    articleLinks <- unlist(strsplit(articleLinks,"href='"))[-1]

    ## Then we use gsub to clean up the link strings

    articleLinks <- gsub(".aspx.*?$",".aspx", articleLinks)

    ## We don't need to collect the index, so we remove the index.aspx if it's in the list

    articleLinks <- articleLinks[articleLinks != "index.aspx"]

    ## Now we have the links we need to collect the article html for the current section.
    ## We begin an inner loop indexed by j over the object "articleLinks".  There may not
    ## be any links for a given subsection; in that case the loop runs zero times.

    for(j in seq_along(articleLinks)){

      ## We print a note to ourselves to see which iteration we are on

      print(paste("scraping article",j,"of subindex",indexNumber,"of day",myDates[k]))

      ## Paste the string together to make the full url for each article

      articleUrl <- paste0("http://www.ahram.org.eg/archive/",articleLinks[j])

      ## collect the article html, skipping this article if the download fails

      articleHtml <- fetchHtml(articleUrl)
      if(is.null(articleHtml)){next}

      ## Each article has a unique number from the URL.  We use regex to delete the
      ## letters and punctuation so that just the digits are left.  Note that
      ## [:punct:] only means "punctuation" when it is inside a bracket expression.

      articleNumber <- gsub("[A-Za-z[:punct:]]","",articleUrl)

      ## save the article html as before

      articleSavePath <- paste0(saveLocation, "/article_",indexNumber,"_",articleNumber,".txt")
      savePage(articleUrl, articleHtml, articleSavePath)

    } # End loop over j articleLinks
  } # End loop over i subindex pages
} # End loop over k dates


## This will take a long time to run because the Al-Ahram website loads each page slowly.
## At the end, you will have a set of directories named by date.  Within each, you'll
## have an index page, subindex pages, and the article html, all in separate files.


