## Working with Unicode in R -- Rich Nielsen

## I always forget how unicode works in R, and I don't find it very intuitive.
## Here are some examples so that I can remember how to do things because
## most of the online "help" is unhelpful.

## Build a string of Arabic text from \u escapes
a <- "\u628\u633\u645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a

## It prints nicely in R; Encoding() reports how the string is marked
Encoding(a)

## To write it out to a text file, first re-mark the string as "bytes"
Encoding(a) <- "bytes"
## With that marking in place, writeLines() writes it out to a text file
writeLines(text = a, con = "out.txt")

## Surprisingly, write() won't work here: it writes out in ASCII. Not super helpful.
write(x = a, file = "out2.txt")

## Reading UTF-8 text back in: declare the encoding of the input
b <- readLines(con = "out.txt", encoding = "UTF-8")
b

## Sometimes Arabic gets dumped into files as escaped code points and looks like this:
x <- "<U+0627><U+0644><U+0635><U+0641><U+062D><U+0629>"
x
## Not very useful... we'd like to convert x back to real Unicode characters.
## A common recipe (see
## http://stackoverflow.com/questions/17761858/converting-a-u-escaped-unicode-string-to-ascii)
## pastes "\u" in front of each code and runs the result through parse().
## That fails for code points above U+FFFF, because a "\u" escape reads at most
## four hex digits, and it errors when the text contains no escapes at all.
## Converting the hex codes directly avoids both problems.
## Note: only the <U+XXXX> escapes are kept; any other text in x is dropped.
## x1: every <U+XXXX> escape found in x
x1 <- regmatches(x, gregexpr("<U\\+[0-9A-Fa-f]+>", x))[[1]]
## x2: the code points as integers (strip the "<U+" and ">" wrappers, read as hex)
x2 <- strtoi(gsub("<U\\+|>", "", x1), base = 16L)
## x3: one UTF-8 string assembled from those code points
x3 <- intToUtf8(x2)
x3

## Another thing you might want to do is use Unicode text in URLs.
## URLs represent it differently (percent-encoded), so the text has to be
## encoded before it is pasted into the address, as below.
## The string is re-created here because `a` was re-marked as "bytes" earlier in this file.
a <- "\u628\u633\u645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a  # the arabic
URLencode(a)  # the arabic in url encoding
## Now we can use this to automate google searches, for example
browseURL(paste0(
  "https://www.google.com/webhp?hl=en&lr=&ie=ISO-8859-1&btnG=Search&gws_rd=ssl#hl=en&lr=&q=",
  URLencode(a),
  "&btnG=Search"
))
## This can also be done with the RCurl package.
## Package names are case-sensitive: the package is "RCurl", so library(rCurl) fails.
library(RCurl)
curlEscape(a)


## Sometimes Arabic comes in the cp1256 encoding, and R does not handle it
## until it has been converted.

## A page that is served in cp1256 encoding
url <- "http://www.almahmood.islamlight.net/index.php?option=content&task=view&id=2609&Itemid=25"
## Note that the lines are brought in marked as bytes
dat <- readLines(con = url, encoding = "bytes")
dat[500]  ## one line, as an example of what the unconverted arabic looks like

## iconv() does the conversion, here for a single line ...
iconv(x = dat[500], from = "CP1256", to = "UTF-8")
## ... and here for the whole document
iconv(x = dat, from = "CP1256", to = "UTF-8")

## Before I found iconv(), I wrote the function used below, which does the same thing.
## It is far slower than iconv(), so there is no point in using it, but its
## building blocks might still be helpful for other problems.

## Source the function that replaces cp1256 characters with UTF-8 characters
## (see ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT)
## NOTE(review): source() executes whatever code this http URL returns -- inspect it first.
source(file = "http://www.mit.edu/~rnielsen/convert%20cp1256.txt")
## The function "cp1256ToUTF8()" should be available after the source() call
cp1256ToUTF8

## Convert just one line from the document
datUtf8 <- cp1256ToUTF8(dat[500])
datUtf8

## Convert the whole document: join the lines into one string, then
## replace everything between "<" and ">" with a space
dat <- gsub(pattern = "<.*?>", replacement = " ", x = paste(dat, collapse = " "))
## Note that some errors get thrown if the function runs into words it can't parse
cp1256ToUTF8(dat)


## Sometimes, against your better judgment, you use MS Excel for a spreadsheet that holds Arabic text.
## If so, bringing it into R is either tricky or impossible, depending on your version of MS Excel.
## In general, I find the best results from saving the .xls[x] file as "Unicode" in Excel, then opening
## that file in a text editor and saving it as UTF-8.  Then, I open it in R with something like:
## (TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.)
dat <- read.table("path/to/file.txt", as.is = TRUE, header = TRUE, fill = TRUE,
                  sep = "\t", encoding = "UTF-8")
## UPDATE: the readr library can at least write out Arabic text
dat <- data.frame(v1 = rep(1, 3), v2 = rep(0, 3), v3 = rep(a, 3),
                  stringsAsFactors = FALSE)
dat
## This should display badly, with <U+xxx>
## But this should display arabic:
dat$v3
## and we can write it out with the arabic displaying correctly in MS Excel using
library(readr)
write_excel_csv(dat, "dat.csv")
## and we can read it in with the readr function
newdat <- read_csv("dat.csv")
head(newdat)
## ...or with the normal function that gives us a data frame
newdat <- read.csv("dat.csv", as.is = TRUE, header = TRUE, encoding = "UTF-8")
newdat
newdat$v3
## should print out arabic

## If you run into Arabic written with the old Unicode "Presentation Forms-B" characters
## (U+FE70 to U+FEFF), here's a function for converting it to the standard Arabic
## characters in the U+0600 range.

## Ok, I can't find a tool online in R that will do the unicode conversion for me, so I'm 
## just going to make a function and then put it into the arabicStemR code

## Replace Arabic Presentation Forms-B characters (U+FE70 to U+FEFF) with the
## corresponding characters from the standard Arabic block (U+0600 range).
##
## texts:   a character vector; every element is processed.
## Returns: the vector with each presentation form replaced by its base letter
##          or mark, each lam-alef ligature expanded into the two letters
##          lam + alef, and U+FEFF removed. Characters without a rule below
##          (including U+FE73) are left as they are.
convertArabicPresentationForms <- function(texts){
    ## Coded from http://www.unicode.org/charts/PDF/UFE70.pdf

    ## Marks: tanween, short vowels, shadda, sukun
    texts <- gsub('[\uFE70-\uFE71]','\u064B', texts)  # fathatan
    ## U+FE72 is dammatan, so it maps to U+064C (not U+064D, which is kasratan)
    texts <- gsub('\uFE72','\u064C', texts)           # dammatan
    texts <- gsub('\uFE74','\u064d', texts)           # kasratan
    texts <- gsub('[\uFE76-\uFE77]','\u064e', texts)  # fatha
    texts <- gsub('[\uFE78-\uFE79]','\u064f', texts)  # damma
    texts <- gsub('[\uFE7A-\uFE7B]','\u0650', texts)  # kasra
    texts <- gsub('[\uFE7C-\uFE7D]','\u0651', texts)  # shadda
    texts <- gsub('[\uFE7E-\uFE7F]','\u0652', texts)  # sukun

    ## Hamza and its carriers
    texts <- gsub('\uFE80','\u0621', texts)           # hamza
    texts <- gsub('[\uFE81-\uFE82]','\u0622', texts)  # alef with madda above
    texts <- gsub('[\uFE83-\uFE84]','\u0623', texts)  # alef with hamza above
    texts <- gsub('[\uFE85-\uFE86]','\u0624', texts)  # waw with hamza above
    texts <- gsub('[\uFE87-\uFE88]','\u0625', texts)  # alef with hamza below
    texts <- gsub('[\uFE89-\uFE8C]','\u0626', texts)  # yeh with hamza above

    ## Letters
    texts <- gsub('[\uFE8D-\uFE8E]','\u0627', texts)  # alef
    texts <- gsub('[\uFE8F-\uFE92]','\u0628', texts)  # beh
    texts <- gsub('[\uFE93-\uFE94]','\u0629', texts)  # teh marbuta
    texts <- gsub('[\uFE95-\uFE98]','\u062A', texts)  # teh
    texts <- gsub('[\uFE99-\uFE9C]','\u062B', texts)  # theh
    texts <- gsub('[\uFE9D-\uFEA0]','\u062C', texts)  # jeem
    texts <- gsub('[\uFEA1-\uFEA4]','\u062D', texts)  # hah
    texts <- gsub('[\uFEA5-\uFEA8]','\u062E', texts)  # khah
    texts <- gsub('[\uFEA9-\uFEAA]','\u062F', texts)  # dal
    texts <- gsub('[\uFEAB-\uFEAC]','\u0630', texts)  # thal
    texts <- gsub('[\uFEAD-\uFEAE]','\u0631', texts)  # reh
    texts <- gsub('[\uFEAF-\uFEB0]','\u0632', texts)  # zain
    texts <- gsub('[\uFEB1-\uFEB4]','\u0633', texts)  # seen
    texts <- gsub('[\uFEB5-\uFEB8]','\u0634', texts)  # sheen
    texts <- gsub('[\uFEB9-\uFEBC]','\u0635', texts)  # sad
    texts <- gsub('[\uFEBD-\uFEC0]','\u0636', texts)  # dad
    texts <- gsub('[\uFEC1-\uFEC4]','\u0637', texts)  # tah
    texts <- gsub('[\uFEC5-\uFEC8]','\u0638', texts)  # zah
    texts <- gsub('[\uFEC9-\uFECC]','\u0639', texts)  # ain
    texts <- gsub('[\uFECD-\uFED0]','\u063A', texts)  # ghain
    texts <- gsub('[\uFED1-\uFED4]','\u0641', texts)  # feh
    texts <- gsub('[\uFED5-\uFED8]','\u0642', texts)  # qaf
    texts <- gsub('[\uFED9-\uFEDC]','\u0643', texts)  # kaf
    texts <- gsub('[\uFEDD-\uFEE0]','\u0644', texts)  # lam
    texts <- gsub('[\uFEE1-\uFEE4]','\u0645', texts)  # meem
    texts <- gsub('[\uFEE5-\uFEE8]','\u0646', texts)  # noon
    texts <- gsub('[\uFEE9-\uFEEC]','\u0647', texts)  # heh
    texts <- gsub('[\uFEED-\uFEEE]','\u0648', texts)  # waw
    texts <- gsub('[\uFEEF-\uFEF0]','\u0649', texts)  # alef maksura
    texts <- gsub('[\uFEF1-\uFEF4]','\u064A', texts)  # yeh

    ## Lam-alef ligatures: one presentation character becomes two letters
    texts <- gsub('[\uFEF5-\uFEF6]','\u0644\u0622', texts)  # lam + alef with madda above
    texts <- gsub('[\uFEF7-\uFEF8]','\u0644\u0623', texts)  # lam + alef with hamza above
    texts <- gsub('[\uFEF9-\uFEFA]','\u0644\u0625', texts)  # lam + alef with hamza below
    texts <- gsub('[\uFEFB-\uFEFC]','\u0644\u0627', texts)  # lam + alef

    ## U+FEFF (byte-order mark / zero-width no-break space) is dropped entirely
    texts <- gsub('\uFEFF','', texts)
    return(texts)
}