#########
#### Likelihood-free Methods
#### Christian P. Robert 
#########

########
### Example 1: Conjugate model (Normal-Inverse Gamma)
########

### Let y1,y2,...,yn ~N(mu,sigma2)
###     mu|sigma2 ~ N(0,sigma2)
###     sigma2    ~IG(1/2,1/2)

library(abc)

help(abc)

### Iris data: sepal width of Iris Setosa

data(iris3)
help(iris3)
y=iris3[,2,1]

#### We want to obtain the following quantities
### "par.sim"     "post.mu"     "post.sigma2" "stat.obs"    "stat.sim"   

## STAT.OBS:  stat.obs are the mean and the variance (on log scale) of the
## data

mean(y)
log(var(y))
stat.obs

stat.oss=c(mean(y),log(var(y)))

### PAR.SIM: par.sim simulated values from the prior distribution

n.sim=10000
gdl=1
invsigma.sim=rchisq(n.sim,df=gdl)
sigma.sim=gdl/invsigma.sim

m=3
mu.sim=c()
for(i in 1:length(sigma.sim)){
mu.sim=c(mu.sim,rnorm(1,mean=m,sd=sqrt(((sigma.sim[i])))))}

prior.sim=data.frame(mu.sim,sigma.sim)

### STAT.SIM: for mu and sigma simulated from the prior, generate data
###           from the model y ~ N(mu,sigma^2)

stat.simulated=matrix(NA,nrow=length(mu.sim),ncol=2)

for(i in 1:length(mu.sim)){
y.new=rnorm(length(y),mu.sim[i],sqrt(sigma.sim[i]))
stat.simulated[i,1]=mean(y.new)
stat.simulated[i,2]=log(var(y.new))
}

### Obtain posterior distribution using ABC

post.value=abc(target=stat.oss, param=prior.sim,sumstat=data.frame(stat.simulated),tol=0.001,method="rejection")

summary(post.value)

posterior.values=post.value$unadj.values
mu.post=posterior.values[,1]
sigma.post=posterior.values[,2]

### True values:
## thanks to conjugancy, we can derive this values analytically
 
post.mean.mu=(length(y)/(length(y)+1))*mean(y)
post.a.sigma=length(y)/2
post.b.sigma=0.5+0.5*sum((y-mean(y))^2)

hist(mu.post,main="Posterior distribution of mu")
abline(v=post.mean.mu,col=2,lwd=2)

hist(sigma.post,main="Posterior distribution of sigma2")
abline(v=post.b.sigma/(post.a.sigma-1),col=2,lwd=2)

### what does it happen changing the tolerance level or increasing the number
## of simulations?


########
### Example 2: Conditional conjugancy (Normal - Normal Gamma)
########

### Let: y1,y2,...,yn ~ N(mu, 1/w);
###                mu ~ N(mu0,1/k0);
###             omega ~ Gamma(alpha0,lambda0)

### Gibbs sampling function

gibbs= function (data,mu0=0,kappa0=0.1,alpha0=0.1,lambda0=0.1,nburn=0,ndraw=5000)
{

#Gibbs sampling for model:

n <- length(data)
sumy <- sum(data)
alpha1 <- alpha0 + (n/2)

#Step 0: initial values drawn from prior

mu <- rnorm(1,mu0,sqrt(1/kappa0))
omega <- rgamma(1,alpha0,1)/lambda0

# matrix that will contain recorded draws:
draws <- matrix(ncol=2,nrow=ndraw)

# MCMC loop:
it <- -nburn
while(it < ndraw){ it <- it+1;
# draw mu:
var <- 1/(kappa0+omega*n)
mu1 <- (kappa0*mu0 + omega*sumy)*var
mu <- rnorm(1,mu1,sqrt(var))
# draw omega:
omega <- rgamma(1,alpha1,1)/(lambda0 + (sum((data - mu)^2))/2 )
# after burn-in record mu and omega:
if(it>0){
draws[it,1] <- mu
draws[it,2] <- omega
}
}
# end MCMC loop
return(draws)
} 

#### Data generation Y~N(5,5)

n.val=50
y=rnorm(n.val,5,sqrt(5))

# apply gibbs function

draws=gibbs(data=y,mu0=3,kappa0=1,alpha0=0.1,lambda0=0.1,nburn=0,ndraw=5000)
dim(draws)

plot(draws[seq(1,5000,10),1],type='l',main="Posterior distribution of mu")
plot(draws[seq(1,5000,10),2],type='l',main="Posterior distribution of omega")

## plot prior and posterior distributions

mu <- draws[c(1001:5000),1]
omega <- draws[c(1001:5000),2]

par(mfrow=c(1,2))
x <- seq(-1,7,length=200)
plot(x,dnorm(x,3,1),type="l")
title("prior dist of mu")
hist(mu,probab=T,main="Posterior distribution of mu")

x <- seq(0.01,5,length=200)
plot(x,dgamma(x,0.1,rate=0.1),type="l")
title("prior dist of w")
hist(omega,probab=T,main="Posterior distribution of omega")

## compute the predictive distribution
x <- seq(-5,15,length=200)
pred <- rep(0,200)
for(j in 1:200){
pred[j] <- mean(dnorm(x[j],mu,1/sqrt(omega)))}

par(mfrow=c(1,1))
plot(x,pred,type="l",xlim=c(-5,15))
title("predictive density: p(y_f|y)")

# question: what it the probability that 
# a future observable lies in the interval (0, 5)?

ynew <- rnorm(length(mu),mu,1/sqrt(omega))
mean(ynew<5 & ynew>0)
hist(ynew, main="Predictive distribution")

#### Let's try using ABC

#### We want to obtain the following quantities
### "par.sim"     "post.mu"     "post.sigma2" "stat.obs"    "stat.sim"   

## STAT.OBS:  stat.obs are the mean and the variance (on log scale) of the
## data

mean(y)
log(var(y))

stat.oss=c(mean(y),log(var(y)))

### PAR.SIM: par.sim simulated values from the prior distribution

n.sim=10000

invsigma.sim=rgamma(n.sim,0.1,0.1)
sigma.sim=1/invsigma.sim

m=3
mu.sim=c()
for(i in 1:length(sigma.sim)){
mu.sim=c(mu.sim,rnorm(1,mean=m,sd=1))}

prior.sim=data.frame(mu.sim,sigma.sim)

### STAT.SIM: for mu and sigma simulated from the prior, generate data
###           from the model y ~ N(mu,sigma^2)

stat.simulated=matrix(NA,nrow=length(mu.sim),ncol=2)

for(i in 1:length(mu.sim)){
y.new=rnorm(length(y),mu.sim[i],sqrt(sigma.sim[i]))
stat.simulated[i,1]=mean(y.new)
stat.simulated[i,2]=log(var(y.new))
}

### Obtain posterior distribution using ABC

post.value=abc(target=stat.oss, param=prior.sim,sumstat=data.frame(stat.simulated),tol=0.01,method="rejection")

posterior.values=post.value$unadj.values
mu.post=posterior.values[,1]
sigma.post=posterior.values[,2]

par(mfrow=c(1,2))
hist(mu.post,main="ABC: posterior distribution of mu")
box()
hist(mu,main="Gibbs sampling: posterior distribution of mu")
box()
summary(mu.post)
summary(mu)

par(mfrow=c(1,2))
hist(sigma.post,main="ABC: posterior distribution of sigma")
box()
hist(1/omega,main="Gibbs sampling: posterior distribution of omega")
box()
summary(sigma.post)
summary(1/omega)

## Notice that ...
length(mu.post)
length(mu)

########
########
### MA(2) model (paper "Approximate Bayesian 
###              Computational methods", arxiv 27May 2011)
########
########

### Example on p. 5

### Data simulation

n=100
theta=c(0.6,0.2)
mu=rnorm(n,0,1)
y=rep(NA,n)
y[1]=mu[1]
y[2]=mu[2]+theta[1]*mu[1]
for(i in 3:n){
y[i]=mu[i]+theta[1]*mu[i-1]+theta[2]*mu[i-2]
}

true.data=y

### Implementing ABC method

n.sim=10^4

# target:  a vector of observed summary statistics
# param:   simulation from the prior distribution
# sumstat: summary statistics of the simulated data

# As summary statistics, Robert et al. (2011) used autocovariances
# defined as follows

stat1.obs=0
stat2.obs=0

h=1
stat1.obs=sum(y[(h+1):n]*y[1:(n-h)])
h=2
stat2.obs=sum(y[(h+1):n]*y[1:(n-h)])

stat.obs=c(stat1.obs,stat2.obs)

# simulation from the prior: identifiability contraint
# theta1 ~ Unif(-2,2)
# theta2+theta1>-1; theta1-theta2<1

k=10
theta1.sim=runif(n.sim*k,-2,2)
theta2.sim=runif(n.sim*k,-1,1)

#check ...

s=theta1.sim+theta2.sim
d=theta1.sim-theta2.sim

theta1.sim=theta1.sim[which(s>-1 & d<1)[1:n.sim]]
theta2.sim=theta2.sim[which(s>-1 & d<1)[1:n.sim]]

#plot(theta1.sim,theta2.sim)

par.sim=data.frame(theta1.sim,theta2.sim)

### summary statistics based on simulated data

sum.stat=matrix(NA,nrow=1,ncol=2)

for(i in 1:n.sim){
mu=rnorm(n,0,1)
y.new=rep(NA,n)
y.new[1]=mu[1]
y.new[2]=mu[2]+theta1.sim[i]*mu[1]
for(j in 3:n){
y.new[j]=mu[j]+theta1.sim[i]*mu[j-1]+theta2.sim[i]*mu[j-2]
}
stat1.obs=0
stat2.obs=0
h=1
stat1.obs=sum(y.new[(h+1):n]*y.new[1:(n-h)])
h=2
stat2.obs=sum(y.new[(h+1):n]*y.new[1:(n-h)])
sum.stat=rbind(sum.stat,c(stat1.obs,stat2.obs))
}

sum.stat=sum.stat[-c(1),]

post.val=abc(target=stat.obs, param=par.sim, sumstat=data.frame(sum.stat), 
tol=0.01, method="rejection")

hist(post.val)

## notice that true values are (0.6,0.2)

theta1.sim=post.val$unadj.values[,1]
theta2.sim=post.val$unadj.values[,2]

plot(theta1.sim,theta2.sim,xlim=c(-2,2),ylim=c(-1,1),xlab="theta1",ylab="theta2")
# triangle: range of acceptable values of theta
polygon(x=c(-2,0,2),y=c(1,-1,1),col="lightgreen",border="lightgreen")
points(theta1.sim,theta2.sim,pch=19)
points(theta[1],theta[2],col="red",pch=19)


########
#### First steps in likelihood-free methods:
#### the choice of the kernel
########

# target posterior distribution: theta|y ~ N(0,1)

# Two different Kernels:

## uniform Kernel

approx1=function(theta,epsilon){
ff=pnorm(epsilon-theta)-pnorm(-epsilon-theta)
return(ff/(2*epsilon))}

## Gaussian Kernel

#### 1: epsilon=sqrt(3)

epsilon=sqrt(3)
theta.sim=seq(-5,5,length=1000)
val.1=approx1(theta=theta.sim,epsilon=epsilon)

curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)))
points(theta.sim,val.1,col=2,type='l',lwd=2,lty=3)
curve(dnorm(x,mean=0,sd=sqrt(1+(epsilon^2)/3)),add=T,col=4,lwd=2,lty=2)
legend(1,0.4,col=c(1,2,4),legend=c("Target posterior","Uniform Kernel","Gaussian Kernel"),lty=c(1,3,2),lwd=2)

#### 2: epsilon=sqrt(3)/2


epsilon=sqrt(3)/2
theta.sim=seq(-5,5,length=1000)
val.1=approx1(theta=theta.sim,epsilon=epsilon)

curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)/2))
points(theta.sim,val.1,col=2,type='l',lwd=2,lty=3)
curve(dnorm(x,mean=0,sd=sqrt(1+(epsilon^2)/3)),add=T,col=4,lwd=2,lty=2)
legend(1,0.4,col=c(1,2,4),legend=c("Target posterior","Uniform Kernel","Gaussian Kernel"),lty=c(1,3,2),lwd=2)


#### 2: epsilon=sqrt(3)/10

epsilon=sqrt(3)/10
theta.sim=seq(-5,5,length=1000)
val.1=approx1(theta=theta.sim,epsilon=epsilon)

curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)/10))
points(theta.sim,val.1,col=2,type='l',lwd=2,lty=3)
curve(dnorm(x,mean=0,sd=sqrt(1+(epsilon^2)/3)),add=T,col=4,lwd=2,lty=2)
legend(1,0.4,col=c(1,2,4),legend=c("Target posterior","Uniform Kernel","Gaussian Kernel"),lty=c(1,3,2),lwd=2)

#### Inefficient summary statistics
#### If we consider a different summary statistics, Pettitt showed that
#### in the case of the Gaussian Kernel, the likelihood free approximation
#### is N(0, (1+e)+epsilon^2/3)

epsilon=sqrt(3)
e=0.5

theta.sim=seq(-5,5,length=1000)
val.1=approx1(theta=theta.sim,epsilon=epsilon)

curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)))
points(theta.sim,val.1,col=2,type='l',lwd=2,lty=3)
curve(dnorm(x,mean=0,sd=sqrt(1+(epsilon^2)/3)),add=T,col=4,lwd=2,lty=2)
curve(dnorm(x,mean=0,sd=sqrt((1/e)+(epsilon^2)/3)),add=T,col=3,lwd=2,lty=4)

legend(0.8,0.4,col=c(1,2,4,3),legend=c("Target posterior","Uniform Kernel","Gaussian Kernel","Inefficient T(x) e=0.5"),lty=c(1,3,2,4),lwd=2)

##### epsilon=sqrt(3)/10

epsilon=sqrt(3)/10
e=0.5

theta.sim=seq(-5,5,length=1000)
val.1=approx1(theta=theta.sim,epsilon=epsilon)

curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)/10))
points(theta.sim,val.1,col=2,type='l',lwd=2,lty=3)
curve(dnorm(x,mean=0,sd=sqrt(1+(epsilon^2)/3)),add=T,col=4,lwd=2,lty=2)
curve(dnorm(x,mean=0,sd=sqrt((1/e)+(epsilon^2)/3)),add=T,col=3,lwd=2,lty=4)

legend(0.8,0.4,col=c(1,2,4,3),legend=c("Target posterior","Uniform Kernel","Gaussian Kernel","Inefficient T(x) e=0.5"),lty=c(1,3,2,4),lwd=2)

### different e values

e=seq(0.1,2,length=10)
curve(dnorm(x,0,1),xlim=c(-3,3),lwd=2,xlab="Theta",ylab="Density",
main=expression(epsilon==sqrt(3)/10),ylim=c(0,0.6))
for(i in 1:length(e)){
curve(dnorm(x,mean=0,sd=sqrt((1/e[i])+(epsilon^2)/3)),add=T,col="grey",lwd=2,lty=4)
}