### R code from vignette source '../univariate_probability_chapter/univariate_probability.Rnw' ################################################### ### code chunk number 1: helperFunction ################################################### roundN <- function(x,decimals=2,fore=5) sprintf(paste("%",fore,".",decimals,"f",sep=""),x) ################################################### ### code chunk number 2: bernoulliCDF ################################################### x <- seq(-1,2,by=0.01) y <- ifelse(x < 0, 0, ifelse(x<1, 0.6, 1)) plot(y ~ x, type ="l", yaxt="n",ylab=expression(F(x)), xlab=expression(x)) axis(2, at=c(0, 0.6, 1), labels=c(0, expression(1-pi), 1),las=1) ################################################### ### code chunk number 3: uniformPDF ################################################### x <- seq(0, 5, by=0.01) y <- sapply(x, function(x) ifelse(x > 3 & x < 4, 1, 0)) plot(y ~ x, xaxt="n", yaxt="n",ylim=c(0,1.2),type="l", ylab=expression(p(x))) axis(1, c(0, 3, 4), c(0, expression(a), expression(b))) axis(2, c(0, 1), c(0, expression(frac(1,b-a))), las=1) ################################################### ### code chunk number 4: uniformCDF ################################################### x <- seq(0,5, by=0.01) y <- sapply(x, function(x) if(x < 3) 0 else if(x < 4) x - 3 else 1) plot(y ~ x, xaxt="n",ylim=c(0,1.2),type="l",ylab=expression(P(x))) axis(1, c(0, 3, 4), c(0, expression(a), expression(b))) ################################################### ### code chunk number 5: hertzFrequencyExample ################################################### x <- seq(0,2000,by=1) y <- log(x,base=10) p <- function(x) {ifelse(x >= 100 & x <= 1000,1/900,0)} plot(x,p(x),type="l",ylim=c(0,0.005),xlab="Frequency (Hertz)",ylab="p(X=x)") ################################################### ### code chunk number 6: hertzFrequencyExampleLog ################################################### p1 <- function(y) {ifelse(10^y >= 100 & 10^y <= 1000, 1/900 * 10^y * log(10),0)} plot(y,p1(y),type="l",xlab="log-Hertz",ylab="p(Y=y)") ################################################### ### code chunk number 7: constituentOrderProbsFn ################################################### probs <- function(g1,g2,g3) { f <- c(g1*g2*g3,g1*g2*(1-g3),g1*(1-g2)*(1-g3),(1-g1)*g2*g3,(1-g1)*(1-g2)*g3,(1-g1)*(1-g2)*(1-g3)) return(f/sum(f)) } g1 <- 0.9 g2 <- 0.8 g3 <- 0.5 p <- probs(g1,g2,g3) freqs <- c(566,488,95,25,11,4) ################################################### ### code chunk number 8: normalPDF ################################################### musigsq <- function(mu1, s1) substitute(paste(mu,"=",mu1,", ",sigma^2,"=",s1), list(mu1=mu1, s1=s1)) x <- seq(-5,5,by=0.01) plot(dnorm(x) ~ x, type="l",ylim=c(0,0.85),lwd=3,cex.lab=1.5, ylab=expression(p(x))) lines(dnorm(x,0,2) ~ x, lty=2, col=3,lwd=3) lines(dnorm(x,0,1/2) ~ x, lty=3, col=2,lwd=3) lines(dnorm(x,2) ~ x, lty=4, col="magenta",lwd=3) legend(-5,0.8, as.expression(c(musigsq(0,1),musigsq(0,2),musigsq(0,0.5),musigsq(2,1))), lty=c(1,2,3,4), col=c(1,3,2,"magenta"),lwd=3,cex=1.2) ################################################### ### code chunk number 9: normalCDF ################################################### plot(pnorm(x) ~ x, type="l",ylim=c(0,1),lwd=3,cex.lab=1.5,ylab=expression(P(X 0.5) acc[i+1] <- sum(testing.data == 1) / N * 4 else acc[i+1] <- sum(testing.data == 0) / N * 4 } ################################################### ### code chunk number 16: f0kdeMultipleBandwidths ################################################### plot(density(aa$F0, bw=2),lwd=3,lty=3,col="green",xlab="F0 frequency (Hz)", ylab=expression(p(x)), main="",cex.lab=1.5,cex.axis=1.5) lines(density(aa$F0, bw=10),lwd=3,lty=2,col="magenta") lines(density(aa$F0, bw=30),lwd=3,lty=1,col="black") legend(145,0.03,c("bw=30","bw=10","bw=2"),lwd=3,lty=1:3,col=c("black","magenta","green"),cex=1.5) rug(aa$F0) ################################################### ### code chunk number 17: likelihoodInKernelEstimation ################################################### f <- function(x,y,bw) { d <- density(x,bw=bw,n=300,from=51,to=350) ## set up the grid of positions at which the kernel density is estimated idx <- sapply(y,function(z) which.max(z==d$x)) ## figure out which position on the grid on which the density is estimated each observation corresponds to sum(log(d$y[idx])) } b <- seq(0.1,30,by=0.1) ## set up the range of bandwidths to be examined result <- sapply(b, function(bw) f(aa$F0,aa$F0,bw)) ################################################### ### code chunk number 18: likelihoodInKernelEstimationPlot ################################################### plot(b,result,type="l",xlab="Bandwidth",ylab="Log-likelihood")#,ylim=c(-550,-250)) ################################################### ### code chunk number 19: likelihoodInKernelEstimationCrossval ################################################### set.seed(10) idx <- sample(1:length(aa$F0)) ## creates a random ordering of the integers from 1 to length(aa$F0); each of the K folds is just a contiguous 1/K chunk of this ordering g <- function(x,bw,idx,folds=6) { ll <- numeric(folds) for(i in 1:folds) { j <- 1:(length(x)/folds) + (length(x)/folds) * (i-1) ## pull out the i-th contiguous 1/K chunk training <- x[idx[-j]] testing <- x[idx[j]] ll[i] <- f(training,testing,bw=bw) } sum(ll) } ll.cval <- sapply(b, function(bw) g(aa$F0,bw,idx)) ## For too narrow a bandwidth, observations on the right tail will have probability indistinguishable from zero, ## so we exclude these from plotting. ################################################### ### code chunk number 20: likelihoodInKernelEstimationCrossvalPlot ################################################### plot(b[ll.cval > -Inf],ll.cval[ll.cval > -Inf],type="l",xlab="Bandwidth",ylab="Cross-validated log-likelihood") ################################################### ### code chunk number 21: brownFirstWords ################################################### plot.word.freqs <- function(dat,offset=10,ylim=NULL) { tot <- sum(dat$Count) idx <- order(dat$Count,decreasing=TRUE) x <- 1:length(idx) if(is.null(ylim)) ylim <- c(0,1.1*max(dat$Count/tot)) plot(x,(dat$Count/tot)[idx[x]],type='h',lwd=1,ylim=1.1*ylim) text.x.pos <- 0.9*max(x)*1:10/10 text.y.pos <- dat$Count[idx[1:10]]/tot arrows(1:10+offset,text.y.pos,text.x.pos,text.y.pos,code=0,col="gray") text(text.x.pos,text.y.pos,paste(round(-log2(text.y.pos),2),dat$Word[idx[1:10]],sep="\n"),pos=3,offset=0.25) } ## Data obtained from command: tregex '__ !< __ !, __' brown-trees-all.noempties | egrep '^[A-Za-z]*$' | sort | uniq -c | tail +2 > ~/ling/prob_lx_book/data/parsed-brown-first-words-of-sentence.txt dat1 <- read.table("../data/parsed-brown-first-words-of-sentence.txt",col.names=c("Count","Word"),quote="") tot1 <- sum(dat1$Count) idx1 <- order(dat1$Count,decreasing=TRUE) ylim <- c(0,max(dat1$Count/tot1)) plot.word.freqs(dat1,ylim=ylim) word1.probs <- dat1$Count/tot1 word1.surprisals <- -log2(word1.probs) ################################################### ### code chunk number 22: brownSecondWords ################################################### ## data obtained from command: tregex '__ !< __ , (__ !< __ !, __)' brown-trees-all.noempties | egrep '^[A-Za-z]*$' | sort | uniq -c | tail +2 > ~/ling/prob_lx_book/data/parsed-brown-second-words-of-sentence.txt dat2 <- read.table("../data/parsed-brown-second-words-of-sentence.txt",col.names=c("Count","Word"),quote="") tot2 <- sum(dat2$Count) idx2 <- order(dat2$Count,decreasing=TRUE) plot.word.freqs(dat2,ylim=ylim) word2.probs <- dat2$Count/tot2 word2.surprisals <- -log2(word2.probs) ################################################### ### code chunk number 23: brownFirstPOS ################################################### ## ddata obtained from command: tregex -u '__ <( __ ! < __)!, __' brown-trees-all.noempties | egrep -o '^[A-Za-z$]*' | sort | uniq -c | tail +2 > ~/ling/prob_lx_book/data/parsed-brown-first-pos-of-sentence.txt posDat1 <- read.table("../data/parsed-brown-first-pos-of-sentence.txt",col.names=c("Count","Word"),quote="") posTot1 <- sum(posDat1$Count) ylim <- c(0,max(posDat1$Count/posTot1)) plot.word.freqs(posDat1,offset=0.25,ylim=ylim) pos1.probs <- with(posDat1,tapply(Count,Word,sum))/posTot1 pos1.surprisals <- -log2(pos1.probs) ################################################### ### code chunk number 24: brownSecondPOS ################################################### ## ddata obtained from command: tregex -u '__ < (__ !< __) , (__ !< __ !, __)' brown-trees-all.noempties | egrep -o '^[A-Za-z$]*' | sort | uniq -c | tail +2 > ~/ling/prob_lx_book/data/parsed-brown-second-pos-of-sentence.txt posDat2 <- read.table("../data/parsed-brown-second-pos-of-sentence.txt",col.names=c("Count","Word"),quote="") posTot2 <- sum(posDat2$Count) plot.word.freqs(posDat2,offset=0.25,ylim=ylim) pos2.probs <- with(posDat2,tapply(Count,Word,sum))/posTot2 pos2.surprisals <- -log2(pos2.probs)