## Corrections to the stm textProcessor function
## and two other functions it relies on.
## Rich Nielsen
## 3/22/19

## Modified textProcessor function
textProcessor2 <- function (documents, metadata = NULL, lowercase = TRUE, removestopwords = TRUE, 
    removenumbers = TRUE, removepunctuation = TRUE, stem = TRUE, 
    wordLengths = c(3, Inf), sparselevel = 1, language = "en", 
    verbose = TRUE, onlycharacter = FALSE, striphtml = FALSE, 
    customstopwords = NULL, v1 = FALSE) 
{
    if (!requireNamespace("tm", quietly = TRUE)) 
        stop("Please install tm package to use this function. You will also need SnowballC if stemming.")
    if (!(utils::packageVersion("tm") >= 0.6)) 
        stop("Please install at least version 0.6 of the tm package.")
    if (stem) {
        if (!requireNamespace("SnowballC", quietly = TRUE)) 
            stop("Please install SnowballC to use stemming.")
    }
    documents <- as.character(documents)
    if (striphtml) {
        documents <- gsub("<.+?>", " ", documents)
    }
    documents <- stringr::str_replace_all(documents, "[^[:graph:]]", 
        " ")
    if (onlycharacter) {
        documents <- gsub("[^[:alnum:]///' ]", " ", documents)
    }
    if (verbose) 
        cat("Building corpus... \n")
    txt <- tm::VCorpus(tm::VectorSource(documents), readerControl = list(language = language))
    txt <- tm::tm_map(txt, tm::stripWhitespace)
    if (lowercase) {
        if (verbose) 
            cat("Converting to Lower Case... \n")
        if (utils::packageVersion("tm") >= "0.6") {
            txt <- tm::tm_map(txt, tm::content_transformer(tolower))
        }
        else {
            txt <- tm::tm_map(txt, tolower)
        }
    }
    if (!v1) {
        if (removepunctuation) {
            if (verbose) 
                cat("Removing punctuation... \n")
            txt <- tm::tm_map(txt, tm::removePunctuation, preserve_intra_word_dashes = TRUE)
        }
    }
    if (removestopwords) {
        if (verbose) 
            cat("Removing stopwords... \n")
        txt <- tm::tm_map(txt, tm::removeWords, tm::stopwords(language))
    }
    if (!is.null(customstopwords)) {
        if (verbose) 
            cat("Remove Custom Stopwords...\n")
        txt <- tm::tm_map(txt, tm::removeWords, customstopwords)
    }
    if (removenumbers) {
        if (verbose) 
            cat("Removing numbers... \n")
        txt <- tm::tm_map(txt, tm::removeNumbers)
    }
    if (v1) {
        if (removepunctuation) {
            if (verbose) 
                cat("Removing punctuation... \n")
            txt <- tm::tm_map(txt, tm::removePunctuation, preserve_intra_word_dashes = TRUE)
        }
    }
    if (stem) {
        if (verbose) 
            cat("Stemming... \n")
        txt <- tm::tm_map(txt, tm::stemDocument, language = language)
    }
    if (!is.null(metadata)) {
        for (i in 1:ncol(metadata)) {
            NLP::meta(txt, colnames(metadata)[i]) <- metadata[, 
                i]
        }
    }
    if (verbose) 
        cat("Creating Output... \n")
    dtm <- tm::DocumentTermMatrix(txt, control = list(wordLengths = wordLengths,tolower=F))
    if (sparselevel != 1) {
        ntokens <- sum(dtm$v)
        V <- ncol(dtm)
        dtm <- tm::removeSparseTerms(dtm, sparselevel)
        if (ncol(dtm) < V & verbose) {
            message <- sprintf("Removed %i of %i terms (%i of %i tokens) due to sparselevel of %s \n", 
                V - ncol(dtm), V, ntokens - sum(dtm$v), ntokens, 
                sparselevel)
            cat(message)
        }
    }
    if (!is.null(metadata)) {
        if (inherits(metadata, "data.frame")) 
            metadata <- as.data.frame(metadata)
        docindex <- unique(dtm$i)
        metadata <- NLP::meta(txt)[docindex, , drop = FALSE]
    }
    out <- read.slam(dtm)
    kept <- (1:length(documents) %in% unique(dtm$i))
    vocab <- as.character(out$vocab)
    out <- list(documents = out$documents, vocab = vocab, meta = metadata, 
        docs.removed = which(!kept))
    class(out) <- "textProcessor"
    return(out)
}

read.slam <- function(corpus) {
  #convert a simple triplet matrix to list format.
  if(!inherits(corpus, "simple_triplet_matrix")) stop("corpus is not a simple triplet matrix")
  if ("TermDocumentMatrix" %in% class(corpus)) {
    non_empty_docs <- which(slam::col_sums(corpus) != 0)
    documents <- ijv.to.doc(corpus[,non_empty_docs]$j, corpus[,non_empty_docs]$i, corpus[,non_empty_docs]$v) 
    names(documents) <- corpus[,non_empty_docs]$dimnames$Docs
   } else {
    non_empty_docs <- which(slam::row_sums(corpus) != 0)
    documents <- ijv.to.doc(corpus[non_empty_docs,]$i, corpus[non_empty_docs,]$j, corpus[non_empty_docs,]$v) 
    names(documents) <- corpus[non_empty_docs,]$dimnames$Docs
  }
  vocab <- corpus$dimnames$Terms
  return(list(documents=documents,vocab=vocab))
}


#Triplet Format to our Document Format
ijv.to.doc <- function(i,j,v) {
  index <- split(j,i)
  index <- lapply(index,as.integer)
  count <- split(v,i)
  count <- lapply(count,as.integer)
  mapply(rbind,index,count, SIMPLIFY=FALSE)
}

