# STA 414, 2011 ASSIGNMENT 3.
#
# FUNCTIONS FOR FITTING A GAUSSIAN MIXTURE MODEL TO DATA WITH MISSING VALUES.
#
# The data is a matrix of real values in which some values may be NA.  The
# data is modeled by a mixture of Gaussians with diagonal covariance matrix.
# The mixture model with K components has the following parameters:
#
#    pi     vector of mixture component probabilities
#    mu     matrix with k'th row containing the mean vector for component k
#    sigma  matrix with k'th row containing the vector of standard deviations
#           for component k


# Find the mixture model log likelihood for given values of the parameters.

log.lik <- function (x, pi, mu, sigma)
{
  K <- length(pi)
  wh <- !is.na(x)

  ll <- 0

  for (i in 1:nrow(x))
  { w <- wh[i,]
    pr <- 0
    for (k in 1:K)
    { pr <- pr + pi[k] * prod(dnorm(x[i,w],mu[k,w],sigma[k,w]))
    }
    ll <- ll + log(pr)
  }

  ll
}


# Fit a mixture model with EM.  The parameters are the data, the number
# of mixture components, the number of EM iterations to do, a matrix
# of initial responsibilities of mixture components for observations, which
# is set randomly by default, and a "print" flag for whether to print details
# each iteration (default FALSE).  The data may have NA values, but to ensure
# that both means and standard deviations can be estimated, for each variable
# there there must be at least two cases with non-missing values for that
# variable.  The value returned is a list with the pi, mu, and sigma
# parameters, plus a vector "ll" of log likelihood values after each M step.

mix.na <- function (x, K, iters,
                    r = matrix(runif(K*nrow(x),0.1,1),nrow(x),K),
                    print = FALSE)
{ 
  N <- nrow(x)

  if (nrow(r)!=N || ncol(r)!=K)
  { stop("Matrix of responsibilities is the wrong size")
  }

  wh <- !is.na(x)
  if (any(colSums(wh)<2))
  { stop("One or more variables has less than two non-missing values")
  }

  # Make sure initial responsibilities sum to one.

  for (i in 1:N)
  { r[i,] <- r[i,] / sum(r[i,])
  }

  # Do EM iterations, starting with the M step, then the E step.

  mu <- matrix(NA,K,ncol(x))
  sigma <- matrix(NA,K,ncol(x))
  ll <- rep(NA,iters)

  for (t in 1:iters)
  {
    if (print)
    { cat("\nITERATION",t,"\n")
      cat("\nr:\n")
      print(round(r,3))
    }

    # M step.

    rt <- colSums(r)
    pi <- rt / sum(rt)

    for (j in 1:ncol(x))
    { w <- wh[,j]
      for (k in 1:K)
      { mu[k,j] <- sum(r[w,k]*x[w,j]) / sum(r[w,k])
        sigma[k,j] <- sqrt(sum(r[w,k]*(x[w,j]-mu[k,j])^2) / sum(r[w,k]))
      }
    }

    if (print)
    { cat("\npi:\n")
      print(round(pi,3))
      cat("mu:\n")
      print(round(mu,3))
      cat("sigma:\n")
      print(round(sigma,3))
    }

    ll[t] <- log.lik(x,pi,mu,sigma)

    if (print)
    { cat("log likelihood:",ll[t],"\n")
    }

    # E step.

    for (i in 1:N)
    { w <- wh[i,]
      for (k in 1:K)
      { r[i,k] <- pi[k] * prod(dnorm(x[i,w],mu[k,w],sigma[k,w]))
      }
      r[i,] <- r[i,] / sum(r[i,]) 
    }
  }

  list (pi=pi, mu=mu, sigma=sigma, r=r, ll=ll)
}


# Fill in missing values with their conditional mean given non-missing values,
# using a mixture model.  The arguments are the data and a list with the
# mixture model parameters.  The value returned is the data with the missing
# values filled in.

mix.fill <- function (x, par)
{
  K <- length(par$pi)
  wh <- !is.na(x)
  r <- numeric(K)

  # Loop over all cases.

  for (i in 1:nrow(x))
  { 
    # Find conditional probability of this case coming from each mixture 
    # component, given the values that are not missing.

    w <- wh[i,]
    for (k in 1:K)
    { r[k] <- par$pi[k] * prod(dnorm(x[i,w],par$mu[k,w],par$sigma[k,w]))
    }
    r <- r / sum(r) 

    # Fill in missing values with their mean based on the probabilities
    # of the mixture components.

    for (j in 1:ncol(x))
    { if (!w[j])
      { x[i,j] <- sum(r*par$mu[,j])
      }
    }
  }

  x
}