# Annotated R session to illustrate fitting a mixture of 2 normals,
# following Chapter 18 of the text

> library(MASS)
> data(geyser)
> attach(geyser)

## first we plot the data and a density estimate

> truehist(waiting,xlim=c(35,115),ymax=0.04,h=5)
> wait.dns <- density(waiting,n=512,width="SJ")
> lines(wait.dns,lty=2)

# mix.obj is a function to compute the negative log-likelihood
# the parameter is specified as a vector of length 5

> mix.obj <- function(p,x)
{
    e <- p[1] *dnorm((x-p[2])/p[3])/p[3] +
        (1-p[1])*dnorm((x-p[4])/p[5])/p[5]
    if(any(e <= 0)) Inf
    else -sum(log(e))
}

> p0 <- c(0.36, 50, 5, 80, 5)  # these starting values taken from the book

## Nelder-Mead optimization (the simplex method)

> optim(p0,mix.obj,x=waiting)
$par
[1] 0.3075467 54.1999336 4.9515962 80.3568069 7.5085281

$value
[1] 1157.542

$counts
function gradient
     147       NA

$convergence
[1] 0

$message
NULL

## method BFGS, a quasi-Newton method

> optim(p0,mix.obj,x=waiting,method="BFGS")
$par
[1] 8188.12630 73.29862 12.90303 801.67155 1485.53616

$value
[1] -1475.754

$counts
function gradient
     122      100

$convergence
[1] 1

$message
NULL

## it failed, but why? here is what the book did

> optim(p0,mix.obj,x=waiting,method="BFGS",
       control=list(parscale=c(.11,rep(1,4))))
$par
[1] 0.3075900 54.2024434 4.9518185 80.3602804 7.5077183

$value
[1] 1157.542

$counts
function gradient
      42       19

$convergence
[1] 0

$message
NULL

## here's what the help file says about parscale

parscale  A vector of scaling values for the parameters. Optimization is
          performed on par/parscale and these should be comparable in the
          sense that a unit change in any element produces about a unit
          change in the scaled value.
> c(.11,rep(1,4))
[1] 0.11 1.00 1.00 1.00 1.00

## this would have to be determined by trial and error; but these mixture
## models are tricky to fit and it's usually pi that causes trouble

## now try a version with derivatives
## we could compute the derivatives by hand, but I'll use deriv3 as
## on p.438

> lmix2 <- deriv3(
+     ~ -log(p*dnorm((x-u1)/s1)/s1 + (1-p)*dnorm((x-u2)/s2)/s2),
+     c("p","u1","s1","u2","s2"),
+     function(x,p,u1,s1,u2,s2) NULL)

## we seem to need this function, even though we already have mix.obj
## because mix.obj takes a parameter argument of length 5, whereas this
## version takes 5 scalar parameters

## this gives a really complicated result; here's how it is used

> mix.gr <- function(p,x){
+     u1 <- p[2]; s1 <- p[3]; u2 <- p[4]; s2 <- p[5]; p <- p[1]
+     colSums(attr(lmix2(x,p,u1,s1,u2,s2),"gradient"))}

## after exploring you can verify that the attribute called gradient
## of lmix2 is of dimension 299 x 5; the gradient is evaluated at 299
## values for each of the 5 parameters

> optim(p0,mix.obj,mix.gr,x=waiting,method="BFGS",hessian = TRUE,
       control=list(parscale=c(.11,rep(1,4))))
$par
[1] 0.3075900 54.2024436 4.9518189 80.3602802 7.5077181

$value
[1] 1157.542

$counts
function gradient
      43       19

$convergence
[1] 0

$message
NULL

$hessian
            [,1]       [,2]        [,3]       [,4]       [,5]
[1,] 1302.324447 -7.9155320 -11.6701192 -6.5967516 12.7068775
[2,]   -7.915532  3.0608623  -1.1456662 -0.4822492  0.8365219
[3,]  -11.670119 -1.1456662   5.2997642 -0.6549970  0.9902167
[4,]   -6.596752 -0.4822492  -0.6549970  3.2307670  0.8919459
[5,]   12.706877  0.8365219   0.9902167  0.8919459  5.4255334

## hessian = TRUE produces the last matrix, which is used to
## estimate the variances and covariances of the mle's

## We can investigate the performance of deriv3 in computing 2nd derivs:
## similarly colSums(attr(lmix2(...),"hessian")) is a 5x5 matrix

> phat <- .Last.value$par
> colSums(attr(lmix2(waiting,phat[1],phat[2],phat[3],phat[4],phat[5]),
              "hessian"))
             p         u1          s1         u2         s2
p  1302.314648 -7.9155228 -11.6701125 -6.5967384 12.7068465
u1   -7.915523  3.0608623  -1.1456661 -0.4822492  0.8365219
s1  -11.670112 -1.1456661   5.2997634 -0.6549970  0.9902167
u2   -6.596738 -0.4822492  -0.6549970  3.2307670  0.8919459
s2   12.706847  0.8365219   0.9902167  0.8919459  5.4255331