## Working with Unicode in R -- Rich Nielsen

## I always forget how unicode works in R, and I don't find it very intuitive.
## Here are some examples so that I can remember how to do things because
## most of the online "help" is unhelpful.

## here's some arabic
a <- "\u0628\u0633\u0645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a  ## It prints nicely in R
Encoding(a)

## If you want to write it out to a text file, first change the encoding to "bytes"
Encoding(a) <- "bytes"
## This will write it out to a text file
writeLines(a, "out.txt")
## Surprisingly, this won't work. It writes out in ASCII. Not super helpful.
write(a, "out2.txt")

## Reading in utf-8 text:
b <- readLines("out.txt", encoding = "UTF-8")
b

## sometimes, arabic gets dumped into files as escaped code points, i.e. each
## character shows up as "U+" plus hex digits wrapped in angle brackets.
## NOTE(review): the original example string was lost (it was empty here, which
## makes the conversion below fail); this is a representative stand-in.
x <- paste0("<", "U+", c("0628", "0633", "0645"), ">", collapse = "")
x

## Not very useful...we'd like to convert x back to unicode
## http://stackoverflow.com/questions/17761858/converting-a-u-escaped-unicode-string-to-ascii
## Each escaped token is replaced in place by the character it names. Decoding
## the hex digits directly (instead of building R source text and parse()-ing
## it) keeps any surrounding text intact and cannot choke on quotes or
## backslashes in the data.
x3 <- x
escape_hits <- gregexpr("[<]U[+][0-9A-Fa-f]+[>]", x3)
regmatches(x3, escape_hits) <- lapply(
  regmatches(x3, escape_hits),
  function(tokens) {
    ## strip everything except the hex digits, then map code point -> character
    hex_digits <- gsub("[^0-9A-Fa-f]", "", tokens)
    intToUtf8(strtoi(hex_digits, base = 16L), multiple = TRUE)
  }
)
x3

## Another thing you might want to do is use unicode in urls
## It turns out that this is represented differently and needs to be fed into
## a url correctly as below:
a <- "\u0628\u0633\u0645 \u0627\uFDF2 \u0627\u0644\u0631\u062D\u0645\u0646 \u0627\u0644\u0631\u062D\u064A\u0645"
a  # the arabic
URLencode(a)  # the arabic in url encoding

## Now we can use this to automate google searches, for example
browseURL(paste0(
  "https://www.google.com/webhp?hl=en&lr=&ie=ISO-8859-1&btnG=Search&gws_rd=ssl#hl=en&lr=&q=",
  URLencode(a),
  "&btnG=Search"
))

## This can also be done with the RCurl library
## (package names are case-sensitive: it is "RCurl", not "rCurl")
library(RCurl)
curlEscape(a)

## Sometimes, Arabic comes in cp1256 encoding. R can't deal with it
## here is a page in cp1256 encoding
url <- "http://www.almahmood.islamlight.net/index.php?option=content&task=view&id=2609&Itemid=25"
dat <- readLines(url, encoding = "bytes")  ## note that we're bringing it in as bytes
dat[500]  ## an example of what the arabic looks like

## the iconv() function will do the conversion
iconv(dat[500], from = "CP1256", to = "UTF-8")
## you can also do the whole document
iconv(dat, from = "CP1256", to = "UTF-8")

## Before I found iconv(), I wrote this function below which does the same thing.
## There's no point in using it because it's far slower than iconv(), but the building
## blocks might still be helpful for other problems.

## source the function replacing cp1256 with utf-8 characters
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT
## NOTE(review): this executes code fetched over plain HTTP; read the file
## before sourcing it.
source("http://www.mit.edu/~rnielsen/convert%20cp1256.txt")
## the function "cp1256ToUTF8()" should now be available
cp1256ToUTF8

## convert just a line from the document
datUtf8 <- cp1256ToUTF8(dat[500])
datUtf8

## convert the whole document: collapse to one string, then blank out HTML tags
dat <- paste(dat, collapse = " ")
dat <- gsub("<.*?>", " ", dat)
## note that some errors get thrown if the function runs into words it can't parse
cp1256ToUTF8(dat)

## Sometimes, against your better judgment you use MS excel for some spreadsheet that has Arabic unicode
## If so, bringing it into R is either tricky or impossible, depending on your version of MS excel.
## In general, I find the best results from saving the .xls[x] file as "Unicode" in excel, then opening
## that file in a text editor and saving it as utf-8.
## Then, I open it in R with something like:
dat <- read.table("path/to/file.txt", as.is = TRUE, header = TRUE, fill = TRUE,
                  sep = "\t", encoding = "UTF-8")

## UPDATE: the readr library can at least write out Arabic code
dat <- data.frame(v1 = rep(1, 3), v2 = rep(0, 3), v3 = rep(a, 3),
                  stringsAsFactors = FALSE)
dat
## This should display badly, with escaped U+XXXX code points instead of Arabic
## But this should display arabic:
dat$v3

## and we can write it out with the arabic displaying correctly in MS excel using
library(readr)
write_excel_csv(dat, "dat.csv")
## and we can read it in with the readr function
newdat <- read_csv("dat.csv")
head(newdat)
## ...or with the normal function that gives us a data frame
## NOTE(review): write_excel_csv() prepends a UTF-8 byte order mark; check that
## the first column name comes back clean from read.csv() on your platform.
newdat <- read.csv("dat.csv", as.is = TRUE, header = TRUE, encoding = "UTF-8")
newdat
newdat$v3  ## should print out arabic

## If you run into Arabic in the old "Presentation Forms" UTF-8 encoding, here's a function
## for transliterating it into the 600 UTF-8 range.
## Ok, I can't find a tool online in R that will do the unicode conversion for me, so I'm
## just going to make a function and then put it into the arabicStemR code

## convertArabicPresentationForms(texts)
##
## Replaces Arabic Presentation Forms-B characters (U+FE70..U+FEFC) with the
## corresponding base characters in the U+0600 block, expands the lam-alef
## ligatures into two letters, and removes U+FEFF (byte order mark).
##
## texts:   character vector (anything else is coerced by gsub()).
## returns: character vector of the same length; text outside the mapped
##          ranges is left untouched.
convertArabicPresentationForms <- function(texts) {
  ## Coded from http://www.unicode.org/charts/PDF/UFE70.pdf
  ## One row per rule: regex matching the presentation form(s), replacement.
  form_map <- matrix(c(
    "[\uFE70-\uFE71]", "\u064B",        # fathatan
    "\uFE72",          "\u064C",        # dammatan (was wrongly sent to kasratan)
    "\uFE74",          "\u064D",        # kasratan
    "[\uFE76-\uFE77]", "\u064E",        # fatha
    "[\uFE78-\uFE79]", "\u064F",        # damma
    "[\uFE7A-\uFE7B]", "\u0650",        # kasra
    "[\uFE7C-\uFE7D]", "\u0651",        # shadda
    "[\uFE7E-\uFE7F]", "\u0652",        # sukun
    "\uFE80",          "\u0621",        # hamza
    "[\uFE81-\uFE82]", "\u0622",        # alef with madda above
    "[\uFE83-\uFE84]", "\u0623",        # alef with hamza above
    "[\uFE85-\uFE86]", "\u0624",        # waw with hamza above
    "[\uFE87-\uFE88]", "\u0625",        # alef with hamza below
    "[\uFE89-\uFE8C]", "\u0626",        # yeh with hamza above
    "[\uFE8D-\uFE8E]", "\u0627",        # alef
    "[\uFE8F-\uFE92]", "\u0628",        # beh
    "[\uFE93-\uFE94]", "\u0629",        # teh marbuta
    "[\uFE95-\uFE98]", "\u062A",        # teh
    "[\uFE99-\uFE9C]", "\u062B",        # theh
    "[\uFE9D-\uFEA0]", "\u062C",        # jeem
    "[\uFEA1-\uFEA4]", "\u062D",        # hah
    "[\uFEA5-\uFEA8]", "\u062E",        # khah
    "[\uFEA9-\uFEAA]", "\u062F",        # dal
    "[\uFEAB-\uFEAC]", "\u0630",        # thal
    "[\uFEAD-\uFEAE]", "\u0631",        # reh
    "[\uFEAF-\uFEB0]", "\u0632",        # zain
    "[\uFEB1-\uFEB4]", "\u0633",        # seen
    "[\uFEB5-\uFEB8]", "\u0634",        # sheen
    "[\uFEB9-\uFEBC]", "\u0635",        # sad
    "[\uFEBD-\uFEC0]", "\u0636",        # dad
    "[\uFEC1-\uFEC4]", "\u0637",        # tah
    "[\uFEC5-\uFEC8]", "\u0638",        # zah
    "[\uFEC9-\uFECC]", "\u0639",        # ain
    "[\uFECD-\uFED0]", "\u063A",        # ghain
    "[\uFED1-\uFED4]", "\u0641",        # feh
    "[\uFED5-\uFED8]", "\u0642",        # qaf
    "[\uFED9-\uFEDC]", "\u0643",        # kaf
    "[\uFEDD-\uFEE0]", "\u0644",        # lam
    "[\uFEE1-\uFEE4]", "\u0645",        # meem
    "[\uFEE5-\uFEE8]", "\u0646",        # noon
    "[\uFEE9-\uFEEC]", "\u0647",        # heh
    "[\uFEED-\uFEEE]", "\u0648",        # waw
    "[\uFEEF-\uFEF0]", "\u0649",        # alef maksura
    "[\uFEF1-\uFEF4]", "\u064A",        # yeh
    "[\uFEF5-\uFEF6]", "\u0644\u0622",  # ligature lam + alef with madda above
    "[\uFEF7-\uFEF8]", "\u0644\u0623",  # ligature lam + alef with hamza above
    "[\uFEF9-\uFEFA]", "\u0644\u0625",  # ligature lam + alef with hamza below
    "[\uFEFB-\uFEFC]", "\u0644\u0627",  # ligature lam + alef
    "\uFEFF",          ""               # byte order mark: dropped
  ), ncol = 2, byrow = TRUE)

  ## The rules touch disjoint source characters and never produce a
  ## presentation form, so applying them one after another is order-independent.
  for (rule in seq_len(nrow(form_map))) {
    texts <- gsub(form_map[rule, 1], form_map[rule, 2], texts)
  }
  texts
}