### R code from vignette source '../univariate_probability_chapter/univariate_probability.Rnw'

###################################################
### code chunk number 1: helperFunction
###################################################
roundN <- function(x,decimals=2,fore=5) sprintf(paste("%",fore,".",decimals,"f",sep=""),x)


###################################################
### code chunk number 2: bernoulliCDF
###################################################
x <- seq(-1,2,by=0.01)
y <- ifelse(x < 0, 0, ifelse(x<1, 0.6, 1))
plot(y ~ x, type ="l", yaxt="n",ylab=expression(F(x)), xlab=expression(x))
axis(2, at=c(0, 0.6, 1), labels=c(0, expression(1-pi), 1),las=1)


###################################################
### code chunk number 3: uniformPDF
###################################################
x <- seq(0, 5, by=0.01)
y <- sapply(x, function(x) ifelse(x > 3 & x < 4, 1, 0))
plot(y ~ x, xaxt="n", yaxt="n",ylim=c(0,1.2),type="l", ylab=expression(p(x)))
axis(1, c(0, 3, 4), c(0, expression(a), expression(b)))
axis(2, c(0, 1), c(0, expression(frac(1,b-a))), las=1)


###################################################
### code chunk number 4: uniformCDF
###################################################
x <- seq(0,5, by=0.01)
y <- sapply(x, function(x) if(x < 3) 0 else if(x < 4) x - 3 else 1)
plot(y ~ x, xaxt="n",ylim=c(0,1.2),type="l",ylab=expression(P(x)))
axis(1, c(0, 3, 4), c(0, expression(a), expression(b)))


###################################################
### code chunk number 5: hertzFrequencyExample
###################################################
x <- seq(0,2000,by=1)
y <- log(x,base=10)
p <- function(x) {ifelse(x >= 100 & x <= 1000,1/900,0)}
plot(x,p(x),type="l",ylim=c(0,0.005),xlab="Frequency (Hertz)",ylab="p(X=x)")


###################################################
### code chunk number 6: hertzFrequencyExampleLog
###################################################
p1 <- function(y) {ifelse(10^y >= 100 & 10^y <= 1000, 1/900 * 10^y * log(10),0)}
plot(y,p1(y),type="l",xlab="log-Hertz",ylab="p(Y=y)")


###################################################
### code chunk number 7: constituentOrderProbsFn
###################################################
probs <- function(g1,g2,g3) {
  f <- c(g1*g2*g3,g1*g2*(1-g3),g1*(1-g2)*(1-g3),(1-g1)*g2*g3,(1-g1)*(1-g2)*g3,(1-g1)*(1-g2)*(1-g3))
  return(f/sum(f))
}
g1 <- 0.9
g2 <- 0.8
g3 <- 0.5
p <- probs(g1,g2,g3)
freqs <- c(566,488,95,25,11,4)


###################################################
### code chunk number 8: normalPDF
###################################################
musigsq <- function(mu1, s1) substitute(paste(mu,"=",mu1,", ",sigma^2,"=",s1), list(mu1=mu1, s1=s1))
x <- seq(-5,5,by=0.01)
plot(dnorm(x) ~ x, type="l",ylim=c(0,0.85),lwd=3,cex.lab=1.5, ylab=expression(p(x)))
lines(dnorm(x,0,2) ~ x, lty=2, col=3,lwd=3)
lines(dnorm(x,0,1/2) ~ x, lty=3, col=2,lwd=3)
lines(dnorm(x,2) ~ x, lty=4, col="magenta",lwd=3)
legend(-5,0.8,
       as.expression(c(musigsq(0,1),musigsq(0,2),musigsq(0,0.5),musigsq(2,1))), lty=c(1,2,3,4), col=c(1,3,2,"magenta"),lwd=3,cex=1.2)


###################################################
### code chunk number 9: normalCDF
###################################################
plot(pnorm(x) ~ x, type="l",ylim=c(0,1),lwd=3,cex.lab=1.5,ylab=expression(P(X<x)))
lines(pnorm(x,0,2) ~ x, lty=2, col=3,lwd=3)
lines(pnorm(x,0,1/2) ~ x, lty=3, col=2,lwd=3)
lines(pnorm(x,2) ~ x, lty=4, col="magenta",lwd=3)
legend(-5,0.8,
       as.expression(c(musigsq(0,1),musigsq(0,2),musigsq(0,0.5),musigsq(2,1))), lty=c(1,2,3,4), col=c(1,3,2,"magenta"),lwd=3,cex=1.2)


###################################################
### code chunk number 10: f0rawHistogram
###################################################
xlim.kde <- c(80,210)
set.seed(100)
pb.dat <- read.table("../data/peterson_barney_data/pb.txt",header=T,quote="")
aa <- subset(pb.dat, Vowel=="aa" & Type=="m")
hist(jitter(aa$F0, amount=2.5), breaks=10000, xlab="F0 frequency (Hz)", main="", ylab="Count",cex.lab=1.5,cex.axis=1.5,xlim=xlim.kde)


###################################################
### code chunk number 11: f0histogramStartAt95
###################################################
hist(aa$F0,breaks=seq(95,215, by=5), freq=F, xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5,xlim=xlim.kde)


###################################################
### code chunk number 12: f0histogramStartAt94
###################################################
hist(aa$F0,breaks=seq(94,284, by=5), freq=F, xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5,xlim=xlim.kde)


###################################################
### code chunk number 13: f0kdeNormal
###################################################
plot(density(aa$F0, bw=5),lty=1,xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5,xlim=xlim.kde)
rug(aa$F0)


###################################################
### code chunk number 14: f0kdeRectangular
###################################################
plot(density(aa$F0, kernel="rectangular",bw=20/sqrt(12)),lty=1,xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5,xlim=xlim.kde) ## note that in R the "bw" parameter is the standard deviation of the kernel density function, so that for a desired rectangle of width W we use a bandwidth of W/sqrt(12)
rug(aa$F0)


###################################################
### code chunk number 15: crossvalidationClassificationAccuracyExample
###################################################
set.seed(23)
N <- 300
M <- 160
x <- c(rep(1,M),rep(0,N-M))
j <- sample(1:N,N)
total.correct <- 0
yHeldOut <- numeric(4)
acc <- numeric(4)
for(i in 0:3) {
  idx <- j[(1:(N/4)) + i*(N/4)]
  testing.data <- x[idx]
  training.data <- x[-idx]
  yHeldOut[i+1] <- sum(testing.data)
  ##print(c(sum(training.data),sum(training.data==0)))
  if(mean(training.data) > 0.5)
    acc[i+1] <- sum(testing.data == 1) / N * 4
  else
    acc[i+1] <- sum(testing.data == 0) / N * 4
}


###################################################
### code chunk number 16: f0kdeMultipleBandwidths
###################################################
plot(density(aa$F0, bw=2),lwd=3,lty=3,col="green",xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5)
lines(density(aa$F0, bw=10),lwd=3,lty=2,col="magenta")
lines(density(aa$F0, bw=30),lwd=3,lty=1,col="black")
legend(145,0.03,c("bw=30","bw=10","bw=2"),lwd=3,lty=1:3,col=c("black","magenta","green"),cex=1.5)
rug(aa$F0)


###################################################
### code chunk number 17: likelihoodInKernelEstimation
###################################################
f <- function(x,y,bw) {
  d <- density(x,bw=bw,n=300,from=51,to=350) ## set up the grid of positions at which the kernel density is estimated
  idx <- sapply(y,function(z) which.max(z==d$x)) ## figure out which position on the grid on which the density is estimated each observation corresponds to
  sum(log(d$y[idx]))
}
b <- seq(0.1,30,by=0.1) ## set up the range of bandwidths to be examined
result <- sapply(b, function(bw) f(aa$F0,aa$F0,bw))


###################################################
### code chunk number 18: likelihoodInKernelEstimationPlot
###################################################
plot(b,result,type="l",xlab="Bandwidth",ylab="Log-likelihood")#,ylim=c(-550,-250))


###################################################
### code chunk number 19: likelihoodInKernelEstimationCrossval
###################################################
set.seed(10)
idx <- sample(1:length(aa$F0)) ## creates a random ordering of the integers from 1 to length(aa$F0); each of the K folds is just a contiguous 1/K chunk of this ordering 
g <- function(x,bw,idx,folds=6) {
  ll <- numeric(folds)
  for(i in 1:folds)  {
    j <- 1:(length(x)/folds) + (length(x)/folds) * (i-1) ## pull out the i-th contiguous 1/K chunk
    training <- x[idx[-j]]
    testing <- x[idx[j]]
    ll[i] <- f(training,testing,bw=bw)
  }
  sum(ll)
}
ll.cval <- sapply(b, function(bw) g(aa$F0,bw,idx))
## For too narrow a bandwidth, observations on the right tail will have probability indistinguishable from zero,
## so we exclude these from plotting.


###################################################
### code chunk number 20: likelihoodInKernelEstimationCrossvalPlot
###################################################
plot(b[ll.cval > -Inf],ll.cval[ll.cval > -Inf],type="l",xlab="Bandwidth",ylab="Cross-validated log-likelihood")


###################################################
### code chunk number 21: brownFirstWords
###################################################
plot.word.freqs <- function(dat,offset=10,ylim=NULL) {
  tot <- sum(dat$Count)
  idx <- order(dat$Count,decreasing=TRUE)
  x <- 1:length(idx)
  if(is.null(ylim))
    ylim <- c(0,1.1*max(dat$Count/tot))
  plot(x,(dat$Count/tot)[idx[x]],type='h',lwd=1,ylim=1.1*ylim)
  text.x.pos <- 0.9*max(x)*1:10/10
  text.y.pos <- dat$Count[idx[1:10]]/tot
  arrows(1:10+offset,text.y.pos,text.x.pos,text.y.pos,code=0,col="gray")
  text(text.x.pos,text.y.pos,paste(round(-log2(text.y.pos),2),dat$Word[idx[1:10]],sep="\n"),pos=3,offset=0.25)	
}

##  Data obtained from command: tregex '__ !< __ !, __' brown-trees-all.noempties | egrep '^[A-Za-z]*$' | sort  | uniq -c | tail +2  > ~/ling/prob_lx_book/data/parsed-brown-first-words-of-sentence.txt
dat1 <- read.table("../data/parsed-brown-first-words-of-sentence.txt",col.names=c("Count","Word"),quote="")
tot1 <- sum(dat1$Count)
idx1 <- order(dat1$Count,decreasing=TRUE)
ylim <- c(0,max(dat1$Count/tot1))
plot.word.freqs(dat1,ylim=ylim)
word1.probs <- dat1$Count/tot1
word1.surprisals <- -log2(word1.probs)


###################################################
### code chunk number 22: brownSecondWords
###################################################
## data obtained from command: tregex '__ !< __ , (__ !< __ !, __)' brown-trees-all.noempties | egrep '^[A-Za-z]*$' | sort  | uniq -c | tail +2  > ~/ling/prob_lx_book/data/parsed-brown-second-words-of-sentence.txt
dat2 <- read.table("../data/parsed-brown-second-words-of-sentence.txt",col.names=c("Count","Word"),quote="")
tot2 <- sum(dat2$Count)
idx2 <- order(dat2$Count,decreasing=TRUE)
plot.word.freqs(dat2,ylim=ylim)
word2.probs <- dat2$Count/tot2
word2.surprisals <- -log2(word2.probs)


###################################################
### code chunk number 23: brownFirstPOS
###################################################
## ddata obtained from command: tregex -u '__ <( __ ! < __)!, __' brown-trees-all.noempties | egrep -o '^[A-Za-z$]*' | sort  | uniq -c | tail +2  > ~/ling/prob_lx_book/data/parsed-brown-first-pos-of-sentence.txt
posDat1 <- read.table("../data/parsed-brown-first-pos-of-sentence.txt",col.names=c("Count","Word"),quote="")
posTot1 <- sum(posDat1$Count)
ylim <- c(0,max(posDat1$Count/posTot1))
plot.word.freqs(posDat1,offset=0.25,ylim=ylim)
pos1.probs <- with(posDat1,tapply(Count,Word,sum))/posTot1
pos1.surprisals <- -log2(pos1.probs)


###################################################
### code chunk number 24: brownSecondPOS
###################################################
## ddata obtained from command: tregex -u '__ < (__ !< __) , (__ !< __ !, __)' brown-trees-all.noempties | egrep -o '^[A-Za-z$]*' | sort  | uniq -c | tail +2  > ~/ling/prob_lx_book/data/parsed-brown-second-pos-of-sentence.txt
posDat2 <- read.table("../data/parsed-brown-second-pos-of-sentence.txt",col.names=c("Count","Word"),quote="")
posTot2 <- sum(posDat2$Count)
plot.word.freqs(posDat2,offset=0.25,ylim=ylim)
pos2.probs <- with(posDat2,tapply(Count,Word,sum))/posTot2
pos2.surprisals <- -log2(pos2.probs)