# Annotated R session to illustrate fitting a mixture of 2 normals,
# following Chapter 18 of the text

> library(MASS)
> data(geyser)
> attach(geyser)

## first we plot the data and a density estimate

> truehist(waiting,xlim=c(35,115),ymax=0.04,h=5)
> wait.dns <- density(waiting,n=512,width="SJ")
> lines(wait.dns,lty=2)

# mix.obj is a function to compute the negative log-likelihood
# the parameter is specified as a vector of length 5

> mix.obj <- function(p,x)
{
    e <- p[1] *dnorm((x-p[2])/p[3])/p[3] +
        (1-p[1])*dnorm((x-p[4])/p[5])/p[5]
    if(any(e <= 0)) Inf
    else -sum(log(e))
}

> p0 <- c(0.36, 50, 5, 80, 5)  # these starting values taken from the book

## Nelder-Mead optimization (the simplex method)

> optim(p0,mix.obj,x=waiting)
$par
[1] 0.3075467 54.1999336 4.9515962 80.3568069 7.5085281

$value
[1] 1157.542

$counts
function gradient
     147       NA

$convergence
[1] 0

$message
NULL

## method BFGS, a quasi-Newton method

> optim(p0,mix.obj,x=waiting,method="BFGS")
$par
[1] 8188.12630 73.29862 12.90303 801.67155 1485.53616

$value
[1] -1475.754

$counts
function gradient
     122      100

$convergence
[1] 1

$message
NULL

## it failed, but why? here is what the book did

> optim(p0,mix.obj,x=waiting,method="BFGS",
       control=list(parscale=c(.11,rep(1,4))))
$par
[1] 0.3075900 54.2024434 4.9518185 80.3602804 7.5077183

$value
[1] 1157.542

$counts
function gradient
      42       19

$convergence
[1] 0

$message
NULL

## here's what the help file says about parscale

parscale  A vector of scaling values for the parameters. Optimization is
          performed on par/parscale and these should be comparable in the
          sense that a unit change in any element produces about a unit
          change in the scaled value.
> c(.11,rep(1,4))
[1] 0.11 1.00 1.00 1.00 1.00

## this would have to be determined by trial and error; but these mixture
## models are tricky to fit and it's usually pi that causes trouble

## now try a version with derivatives
## we could compute the derivatives by hand, but I'll use deriv3 as
## on p.438

> lmix2 <- deriv3(
+     ~ -log(p*dnorm((x-u1)/s1)/s1 + (1-p)*dnorm((x-u2)/s2)/s2),
+     c("p","u1","s1","u2","s2"),
+     function(x,p,u1,s1,u2,s2) NULL)

## we seem to need this function, even though we already have mix.obj
## because mix.obj takes a parameter argument of length 5, whereas this
## version takes 5 scalar parameters

## this gives a really complicated result; here's how it is used

> mix.gr <- function(p,x){
+     u1 <- p[2]; s1 <- p[3]; u2 <- p[4]; s2 <- p[5]; p <- p[1]
+     colSums(attr(lmix2(x,p,u1,s1,u2,s2),"gradient"))}

## after exploring you can verify that the attribute called gradient
## of lmix2 is of dimension 299 x 5; the gradient is evaluated at 299
## values for each of the 5 parameters

> optim(p0,mix.obj,mix.gr,x=waiting,method="BFGS",hessian = TRUE,
       control=list(parscale=c(.11,rep(1,4))))
$par
[1] 0.3075900 54.2024436 4.9518189 80.3602802 7.5077181

$value
[1] 1157.542

$counts
function gradient
      43       19

$convergence
[1] 0

$message
NULL

$hessian
            [,1]       [,2]        [,3]       [,4]       [,5]
[1,] 1302.324447 -7.9155320 -11.6701192 -6.5967516 12.7068775
[2,]   -7.915532  3.0608623  -1.1456662 -0.4822492  0.8365219
[3,]  -11.670119 -1.1456662   5.2997642 -0.6549970  0.9902167
[4,]   -6.596752 -0.4822492  -0.6549970  3.2307670  0.8919459
[5,]   12.706877  0.8365219   0.9902167  0.8919459  5.4255334

## hessian = TRUE produces the last matrix, which is used to
## estimate the variances and covariances of the mle's

## We can investigate the performance of deriv3 in computing 2nd derivs:
## similarly colSums(attr(lmix2(...),"hessian")) is a 5x5 matrix

> phat <- .Last.value$par
> colSums(attr(lmix2(waiting,phat[1],phat[2],phat[3],phat[4],phat[5]),
              "hessian"))
             p         u1          s1         u2         s2
p  1302.314648 -7.9155228 -11.6701125 -6.5967384 12.7068465
u1   -7.915523  3.0608623  -1.1456661 -0.4822492  0.8365219
s1  -11.670112 -1.1456661   5.2997634 -0.6549970  0.9902167
u2   -6.596738 -0.4822492  -0.6549970  3.2307670  0.8919459
s2   12.706847  0.8365219   0.9902167  0.8919459  5.4255331