# Solution to practical problem 1.3 (R script)

# Solution to practical problem 1.3
# (a)
# (i)
# The R function "dmnorm" given below computes the pdf of a mixture of m normal
# distributions:
# f(x) = alpha[1]*N(mean[1],sd[1]) + alpha[2]*N(mean[2],sd[2])+..+alpha[m]*N(mean[m],sd[m])
#
# Arguments:
#   x     - numeric vector of points at which the density is evaluated
#   alpha - vector of the m mixture weights
#   mean  - vector of component means (at least m entries)
#   sd    - vector of component standard deviations (at least m entries)
# Value: numeric vector of length(x) holding the mixture density at each point of x
dmnorm<-function(x,alpha,mean,sd)
{
  m<-length(alpha)
  # fail early instead of silently returning NA when a component parameter is missing
  if (length(mean)<m || length(sd)<m)
  {
    stop("'mean' and 'sd' must supply one value per mixture component",call.=FALSE)
  }
  dens<-rep(0,length(x))
  # seq_len(m) is empty for m = 0, whereas 1:m would iterate over c(1,0)
  for (i in seq_len(m))
  {
    dens<-dens+alpha[i]*dnorm(x,mean[i],sd[i])
  }
  dens
}
# Evaluate the mixture density on an equally spaced grid of 100 points over [0,40]
# and draw it as a curve for the parameter values given below
alpha<-c(0.4,0.3,0.3)   # mixture weights
mean<-c(10,20,30)       # component means
sd<-c(2,3,2)            # component standard deviations
x<-seq(from=0,to=40,length.out=100)
pdf<-dmnorm(x=x,alpha=alpha,mean=mean,sd=sd)
plot(x,pdf,main="Density of a mixture of 3 normal distributions",type="l")
# (ii)
# A not particularly efficient version of "rmnorm", a function for generating random numbers
# from the above distribution (it uses a loop)
#
# Arguments:
#   n     - number of random numbers to generate
#   alpha - vector of the m mixture weights (used as sampling probabilities)
#   mean  - vector of component means
#   sd    - vector of component standard deviations
# Value: numeric vector of length n
rmnorm<-function(n,alpha,mean,sd)
{
  m<-length(alpha)
  rx<-numeric(n)
  # seq_len(n) is empty for n = 0; 1:n would run over c(1,0) and return one value too many
  for (i in seq_len(n))
  {
    # first pick the mixture component, then draw from that component
    ind<-sample.int(m,1,prob=alpha)
    rx[i]<-rnorm(1,mean[ind],sd[ind])
  }
  rx
}
# Note: The numeric() command creates a real vector of the specified length with 0 entries
# A more efficient version of "rmnorm" (no loop)
#
# Arguments:
#   n     - number of random numbers to generate
#   alpha - vector of the m mixture weights (used as sampling probabilities)
#   mean  - vector of component means
#   sd    - vector of component standard deviations
# Value: numeric vector of length n
rmnorm<-function(n,alpha,mean,sd)
{
  m<-length(alpha)
  # draw all n component labels at once; TRUE is spelled out because T can be reassigned
  ind<-sample.int(m,n,replace=TRUE,prob=alpha)
  # rnorm() is vectorised over its mean and sd arguments, so one call draws every value
  rnorm(n,mean[ind],sd[ind])
}
# test the function rmnorm() by generating a sample of size 1000 and comparing the
# histogram with the mixture density
rx<-rmnorm(1000,alpha,mean,sd)
# the x-range always covers [0,40] and is widened if the sample falls outside it
hist(rx,probability=TRUE,ylim=c(0,0.1),breaks=10,
     main="histogram of random sample and normal\nmixture density",
     xlab="x",xlim=c(min(0,rx),max(rx,40)))
# Note: a single number passed as breaks defines the intended number of bins of the
# histogram; the actual number of bins may diverge from that number
x<-seq(0,40,length.out=100)
pdf<-dmnorm(x,alpha,mean,sd)
lines(x,pdf,type="l",col="red")
# (b)
# generate n=50 observations using rmnorm()
set.seed(1)
# note: the set.seed()-command is used to fix the starting point for the random number
# generation
n<-50
rx<-rmnorm(n,alpha,mean,sd)
# load the library sm
library(sm)
# compare the histogram of the sample and the density estimates for different bandwidth
# selectors with the normal mixture pdf
par(mfrow=c(2,2))
hist(rx,probability=TRUE,ylim=c(0,0.1),xlim=c(0,40))
lines(x,pdf,type="l",col="red")
# Note : x and pdf have already been defined above
# each bandwidth is computed once and reused for the estimate and for the plot title
h_norm<-hnorm(rx)
sm.density(rx,h=h_norm,ylim=c(0,0.1))
# Note: the h=hnorm(rx)-option computes the normal-based optimal bandwidth
# Note: by default the sm.density()-command produces a plot of the estimated density;
# therefore graphical parameters can be set within this command
# Note: The small bars at the bottom of the figure which mark the observations vary between
# the plots since, just for the plot, the sm.density()-function adds a small random
# component to the observations (which is useful if there are many identical observations)
title(main=paste("normal-based bandwidth \n h=",round(h_norm,2)))
lines(x,pdf,type="l",col="red")
h_cv<-hcv(rx)
sm.density(rx,h=h_cv,ylim=c(0,0.1))
# Note: the h=hcv(rx)-option computes the cross-validation optimal bandwidth
title(main=paste("cross-validation bandwidth \n h=",round(h_cv,2)))
lines(x,pdf,type="l",col="red")
h_sj<-hsj(rx)
sm.density(rx,h=h_sj,ylim=c(0,0.1))
# Note: the h=hsj(rx)-option computes the Sheather-Jones plug-in optimal bandwidth
title(main=paste("plug-in bandwidth \n h=",round(h_sj,2)))
lines(x,pdf,type="l",col="red")
# Note: Since the known original pdf is more wiggly than a normal distribution, the
# normal-based optimal bandwidth is too large, leading to a density estimate which is
# too smooth
# (c)
# Repeat the commands given under (b), but with n<-20 and n<-100
# Conclusion: The "hnorm" selector clearly oversmooths, that is, it selects h too large.
# (d)
# Read the tree-data
trees<-read.table("D:/DATA/AST/tree.dat",header=TRUE)
# Note: you may need to change the path that points to the data
attach(trees)
# Note: the attach()-command makes the columns of the dataset directly accessible via their
# names
# NOTE(review): attach() is generally discouraged; it is kept here because code outside
# this section may rely on the attached column names - confirm before replacing it
# draw a random sample of size 100 from the tree heights
set.seed(1)
sh<-sample(height,100)
# Estimate the density from the height tree sample using the three different bandwidth
# selectors and compare the density estimates with the histogram of the tree height
# population
par(mfrow=c(2,2))
# each bandwidth is computed once and reused for the estimate and for the plot title
h_norm_sh<-hnorm(sh)
hist(sh,probability=TRUE,ylim=c(0,0.15),xlim=c(0,45),main="")
sm.density(sh,h=h_norm_sh,add=TRUE,col=2)
# Note: if the add=TRUE-option is used, the density estimate plot is added to the existing plot
title(main=paste("normal-based bandwidth \n h=",round(h_norm_sh,2)))
h_cv_sh<-hcv(sh)
hist(sh,probability=TRUE,ylim=c(0,0.15),xlim=c(0,45),main="")
sm.density(sh,h=h_cv_sh,add=TRUE,col=2)
title(main=paste("cross-validation bandwidth \n h=",round(h_cv_sh,2)))
h_sj_sh<-hsj(sh)
hist(sh,probability=TRUE,ylim=c(0,0.15),xlim=c(0,45),main="")
sm.density(sh,h=h_sj_sh,add=TRUE,col=2)
title(main=paste("plug-in bandwidth \n h=",round(h_sj_sh,2)))
# histogram of the full tree height population for comparison
# NOTE(review): seq(0,45,2) ends at 44, so hist() stops with an error if any height
# exceeds 44 - confirm the data range
hist(height,probability=TRUE,breaks=seq(0,45,2),ylim=c(0,0.15),xlim=c(0,45))