# STA 414, 2011 ASSIGNMENT 3. # # FUNCTIONS FOR FITTING A GAUSSIAN MIXTURE MODEL TO DATA WITH MISSING VALUES. # # The data is a matrix of real values in which some values may be NA. The # data is modeled by a mixture of Gaussians with diagonal covariance matrix. # The mixture model with K components has the following parameters: # # pi vector of mixture component probabilities # mu matrix with k'th row containing the mean vector for component k # sigma matrix with k'th row containing the vector of standard deviations # for component k # Find the mixture model log likelihood for given values of the parameters. log.lik <- function (x, pi, mu, sigma) { K <- length(pi) wh <- !is.na(x) ll <- 0 for (i in 1:nrow(x)) { w <- wh[i,] pr <- 0 for (k in 1:K) { pr <- pr + pi[k] * prod(dnorm(x[i,w],mu[k,w],sigma[k,w])) } ll <- ll + log(pr) } ll } # Fit a mixture model with EM. The parameters are the data, the number # of mixture components, the number of EM iterations to do, a matrix # of initial responsibilities of mixture components for observations, which # is set randomly by default, and a "print" flag for whether to print details # each iteration (default FALSE). The data may have NA values, but to ensure # that both means and standard deviations can be estimated, for each variable # there there must be at least two cases with non-missing values for that # variable. The value returned is a list with the pi, mu, and sigma # parameters, plus a vector "ll" of log likelihood values after each M step. mix.na <- function (x, K, iters, r = matrix(runif(K*nrow(x),0.1,1),nrow(x),K), print = FALSE) { N <- nrow(x) if (nrow(r)!=N || ncol(r)!=K) { stop("Matrix of responsibilities is the wrong size") } wh <- !is.na(x) if (any(colSums(wh)<2)) { stop("One or more variables has less than two non-missing values") } # Make sure initial responsibilities sum to one. for (i in 1:N) { r[i,] <- r[i,] / sum(r[i,]) } # Do EM iterations, starting with the M step, then the E step. mu <- matrix(NA,K,ncol(x)) sigma <- matrix(NA,K,ncol(x)) ll <- rep(NA,iters) for (t in 1:iters) { if (print) { cat("\nITERATION",t,"\n") cat("\nr:\n") print(round(r,3)) } # M step. rt <- colSums(r) pi <- rt / sum(rt) for (j in 1:ncol(x)) { w <- wh[,j] for (k in 1:K) { mu[k,j] <- sum(r[w,k]*x[w,j]) / sum(r[w,k]) sigma[k,j] <- sqrt(sum(r[w,k]*(x[w,j]-mu[k,j])^2) / sum(r[w,k])) } } if (print) { cat("\npi:\n") print(round(pi,3)) cat("mu:\n") print(round(mu,3)) cat("sigma:\n") print(round(sigma,3)) } ll[t] <- log.lik(x,pi,mu,sigma) if (print) { cat("log likelihood:",ll[t],"\n") } # E step. for (i in 1:N) { w <- wh[i,] for (k in 1:K) { r[i,k] <- pi[k] * prod(dnorm(x[i,w],mu[k,w],sigma[k,w])) } r[i,] <- r[i,] / sum(r[i,]) } } list (pi=pi, mu=mu, sigma=sigma, r=r, ll=ll) } # Fill in missing values with their conditional mean given non-missing values, # using a mixture model. The arguments are the data and a list with the # mixture model parameters. The value returned is the data with the missing # values filled in. mix.fill <- function (x, par) { K <- length(par$pi) wh <- !is.na(x) r <- numeric(K) # Loop over all cases. for (i in 1:nrow(x)) { # Find conditional probability of this case coming from each mixture # component, given the values that are not missing. w <- wh[i,] for (k in 1:K) { r[k] <- par$pi[k] * prod(dnorm(x[i,w],par$mu[k,w],par$sigma[k,w])) } r <- r / sum(r) # Fill in missing values with their mean based on the probabilities # of the mixture components. for (j in 1:ncol(x)) { if (!w[j]) { x[i,j] <- sum(r*par$mu[,j]) } } } x }