# Solution to practical problem 1.3

# (a)
# (i)

# The R function "dmnorm" given below computes the pdf of a mixture of m normal
# distributions:
#   f(x) = alpha[1]*N(mean[1],sd[1]) + alpha[2]*N(mean[2],sd[2]) + ... +
#          alpha[m]*N(mean[m],sd[m])
#
# Arguments:
#   x      numeric vector of points at which the density is evaluated
#   alpha  mixture weights, one per component
#   mean   component means (same length as alpha)
#   sd     component standard deviations (same length as alpha)
# Value: numeric vector of the same length as x.
dmnorm <- function(x, alpha, mean, sd) {
  # a shorter mean/sd vector would otherwise be indexed out of range and
  # silently turn the whole density into NA
  stopifnot(length(mean) == length(alpha), length(sd) == length(alpha))
  pdf <- numeric(length(x))
  # seq_along() instead of 1:m: the loop is simply skipped for zero components
  for (i in seq_along(alpha)) {
    pdf <- pdf + alpha[i] * dnorm(x, mean[i], sd[i])
  }
  pdf
}

# plot the density of the normal mixture for the given parameter values
alpha <- c(0.4, 0.3, 0.3)
mean <- c(10, 20, 30)
sd <- c(2, 3, 2)
x <- seq(0, 40, length.out = 100)
pdf <- dmnorm(x, alpha, mean, sd)
plot(x, pdf, type = "l", main = "Density of a mixture of 3 normal distributions")

# (ii)

# A not particularly efficient version of "rmnorm", a function for generating
# random numbers from the above distribution (it uses a loop). It is kept under
# its own name so that it is not silently overwritten by the efficient version
# defined below.
#
# Arguments:
#   n      number of random numbers to generate
#   alpha  mixture weights, used as sampling probabilities for the component
#   mean   component means (same length as alpha)
#   sd     component standard deviations (same length as alpha)
# Value: numeric vector of length n.
rmnorm_loop <- function(n, alpha, mean, sd) {
  stopifnot(length(mean) == length(alpha), length(sd) == length(alpha))
  m <- length(alpha)
  # Note: The numeric() command creates a real vector of the specified length
  # with 0 entries
  rx <- numeric(n)
  # seq_len(n) instead of 1:n: for n = 0 the loop body must not run at all
  for (i in seq_len(n)) {
    ind <- sample(seq_len(m), 1, prob = alpha)
    rx[i] <- rnorm(1, mean[ind], sd[ind])
  }
  rx
}

# A more efficient version of "rmnorm" (no loop): first draw the component
# index of every observation, then draw all normal variates in one call.
# Arguments and value are the same as for the loop version above.
rmnorm <- function(n, alpha, mean, sd) {
  stopifnot(length(mean) == length(alpha), length(sd) == length(alpha))
  m <- length(alpha)
  ind <- sample(seq_len(m), n, replace = TRUE, prob = alpha)
  rnorm(n, mean[ind], sd[ind])
}

# test the function rmnorm() by generating a sample of size 1000 and comparing
# the histogram with the mixture density
rx <- rmnorm(1000, alpha, mean, sd)
# Note: a single number given as breaks-option defines the intended number of
# bins of the histogram; the actual number of bins may diverge from that number
hist(
  rx,
  probability = TRUE,
  ylim = c(0, 0.1),
  breaks = 10,
  main = "histogram of random sample and normal mixture density",
  xlab = "x",
  xlim = range(0, 40, rx)
)
# Note: x and pdf (the mixture density on a grid over [0, 40]) were computed in
# (i) and are reused here and in (b)
lines(x, pdf, type = "l", col = "red")

# (b)

# generate n=50 observations using rmnorm()
# Note: the set.seed()-command is used to fix the starting point for the random
# number generation
set.seed(1)
n <- 50
rx <- rmnorm(n, alpha, mean, sd)

# load the library sm
library(sm)

# compare the histogram of the sample and the density estimates for different
# bandwidth selectors with the normal mixture pdf
par(mfrow = c(2, 2))
hist(rx, probability = TRUE, ylim = c(0, 0.1), xlim = c(0, 40))
lines(x, pdf, type = "l", col = "red")

# Each bandwidth is computed once and reused for the estimate and for the title:
#   hnorm(rx) computes the normal-based optimal bandwidth
#   hcv(rx)   computes the cross-validation optimal bandwidth
#   hsj(rx)   computes the Sheather-Jones plug-in optimal bandwidth
selectors <- list(
  list(label = "normal-based bandwidth \n h=", h = hnorm(rx)),
  list(label = "cross-validation bandwidth \n h=", h = hcv(rx)),
  list(label = "plug-in bandwidth \n h=", h = hsj(rx))
)

# Note: by default the sm.density()-command produces a plot of the estimated
# density; therefore graphical parameters can be set within this command
# Note: The small bars at the bottom of the figure which mark the observations
# vary between the plots since, just for the plot, the sm.density()-function
# adds a small random component to the observations (which is useful if there
# are many identical observations)
for (selector in selectors) {
  sm.density(rx, h = selector$h, ylim = c(0, 0.1))
  title(main = paste(selector$label, round(selector$h, 2)))
  lines(x, pdf, type = "l", col = "red")
}

# Note: Since the known original pdf is more wiggly than a normal distribution
# the normal-based optimal bandwidth is too large, leading to a density estimate
# which is too smooth

# (c)

# Repeat the commands given under (b), but with n<-20 and n<-100
# Conclusion: The "hnorm" clearly oversmooths, that is, it selects h too large.
# (d)

# Read the tree-data
# Note: you may need to change the path that points to the data
tree_file <- "D:/DATA/AST/tree.dat"
trees <- read.table(tree_file, header = TRUE)

# Note: the attach()-command makes the columns of the dataset directly
# accessible via their names. It is kept so that code relying on the bare
# column names keeps working; the commands below nevertheless refer to
# trees$height explicitly, so that a global object called "height" cannot
# silently mask the data column.
attach(trees)

# draw a random sample of size 100 from the tree heights
set.seed(1)
sh <- sample(trees$height, 100)

# Estimate the density from the tree height sample using the three different
# bandwidth selectors and compare the density estimates with the histogram of
# the tree height population
par(mfrow = c(2, 2))

# each bandwidth is computed once and reused for the estimate and for the title
selectors <- list(
  list(label = "normal-based bandwidth \n h=", h = hnorm(sh)),
  list(label = "cross-validation bandwidth \n h=", h = hcv(sh)),
  list(label = "plug-in bandwidth \n h=", h = hsj(sh))
)

for (selector in selectors) {
  hist(sh, probability = TRUE, ylim = c(0, 0.15), xlim = c(0, 45), main = "")
  # Note: if the add=TRUE-option is used, the density estimate plot is added to
  # the existing plot
  sm.density(sh, h = selector$h, add = TRUE, col = 2)
  title(main = paste(selector$label, round(selector$h, 2)))
}

# histogram of the whole tree height population for comparison; main and xlab
# are given explicitly so that the labels stay "height" although the column is
# referenced as trees$height
hist(
  trees$height,
  probability = TRUE,
  breaks = seq(0, 45, 2),
  ylim = c(0, 0.15),
  xlim = c(0, 45),
  main = "Histogram of height",
  xlab = "height"
)