#import the data (make sure you have the proper working directory) ibex=read.delim("ibex horns.txt") #check the dimension of the data dim(ibex) #look at the initial portion of the data head(ibex) #remove the last 6 columns (they appear to be empty) ibex=ibex[,1:12] #look at the last portion of the data tail(ibex) #remove the last 4 lines keping all the columns (these also appear to be empty) ibex=ibex[1:3050,] #look at all the data for ibex number 672 ibex[ibex$id==672,] #we note that this ibex was entered twice by "mistake" (two measurements) #replace the two almost replica by their average tempibex=(ibex[1074+2*(0:7),]+ibex[1075+2*(0:7),])/2 #replace the newly created data (the average) in the original dataset ibex[1074:(1074+7),]=tempibex #remove the replicated rows ibex=ibex[-((1074+8):(1074+15)),] #check that we did the right thing ibex[ibex$id==672,] #create the dependent variable (the one that I am to explain): log of ibex growth y=log(ibex$c.dim) #create the explanatory variables = a matrix made of columns of covariates #first create the intercept ones=rep(1,length(y)) #then put together the other explanatory variables x=cbind(ones,log(ibex$accr/(1+ibex$accr)),ibex$year,ibex$surv1,ibex$ril) #eliminate rows with missing values using the command #that returns a vector of T and F is.na(ibex$sx)[1070:1085] #to remove the missing entries (recall to use == when using logical conditions) ibex[is.na(ibex$sx)==FALSE,] #test which columns contain missing entries #and count, for each variable, the number of missing values apply(is.na(ibex),2,sum) #fit a classical linear regression model (remove the intercept with the -1 command #since the intercept is already included in the x matrix model=lm(y~x-1) #give names to the columns of the explanatory variables in the x matrix colnames(x)=c("intercept","age","year","survival","surveyor") colnames(x) #now look at the output of the fitted model summary(model) #note: we have repated measures here, so we should use nmle : load the package "nmle" #and run the following command to have a random intercept for each individual summary(lme(y~x-1,random=~1|id,ibex)) #use the code made for Bayesian Core source("progs.3.R") n=length(y) X=x k=dim(X)[2] #usual lse betahat=inv(t(X)%*%X)%*%t(X)%*%y betahat # 1/c = prior sample size # c=.1 -> prior guess corresponds to 10 obs (strong prior information) # c=1000 -> very little prior information, the estimators become more and more similar to the MLE #look at the effect of different c on the posterior expectation of sigma^2 # c=0.1 S2=t(y-X%*%betahat)%*%(y-X%*%betahat) a=2.1 b=2 M=10*diag(5) T=inv(inv(M)+inv(t(X)%*%X)) (2*b+S2+t(betahat)%*%T%*%betahat)/(n+2*a-2) # c=1 M=diag(5) T=inv(inv(M)+inv(t(X)%*%X)) (2*b+S2+t(betahat)%*%T%*%betahat)/(n+2*a-2) # c=10 M=0.1*diag(5) T=inv(inv(M)+inv(t(X)%*%X)) (2*b+S2+t(betahat)%*%T%*%betahat)/(n+2*a-2) # c=100 M=0.01*diag(5) T=inv(inv(M)+inv(t(X)%*%X)) (2*b+S2+t(betahat)%*%T%*%betahat)/(n+2*a-2) # c=1000 M=0.001*diag(5) T=inv(inv(M)+inv(t(X)%*%X)) (2*b+S2+t(betahat)%*%T%*%betahat)/(n+2*a-2) #look at the effect of different c on the posterior expectation of beta # c=1000 M=0.001*diag(5) b5beta=inv(M+t(X)%*%X)%*%t(X)%*%y b5beta[1] # compare with MLE for beta_1 betahat[1] [1] 9.702976 #if we do not believe much in our prior we should pick c to be very large # ZELLNER INFORMATIVE G-PRIOR ANALYSIS: POSTERIOR MEAN OF beta FOR c=100 100/101*betahat # POSTERIOR VARIANCE OF beta FOR c=1000 diag(1000/(n*1001)*as.real(S2+t(betahat)%*%t(X)%*%X%*%betahat/101)*inv(t(X)%*%X)) # compare with formulas on p. 38 with tilde(beta) = 0 (due to the prior mean on beta we picked) # c has still an impact but at least things are on the right scale since we used X^tX in our prior # this can be used 9after taking the sqrt) to construct confidence intervals sqrt(diag(10000/(n*10001)*as.real(S2+t(betahat)%*%t(X)%*%X%*%betahat/10001)*inv(t(X)%*%X))) # CONFIDENCE INTERVALS # diagonal elements in the posterior estimate of the variance coariance matrix # 2.196655093 0.052934235 0.001101347 0.009177800 0.002363016 c=1000 (c/(c+1))*betahat[1]-2.196655093 * qt(0.975,df=n) (c/(c+1))*betahat[1]+2.196655093 * qt(0.975,df=n) # since the posterior variance for the intercept # is so big compared to the variance of the other estimated parameters # (that are of the order of 0.0 somethings) # we should not trust the estimated value of the intercept #to simulate from a multivariate normal distirbution # need to load the library MASS library(MASS) # to generate a sample from the posterior of beta (notice c = 100 in the sequel) # first generate the sigma sigmasample = 1/rgamma(10^4,n/2,(S2+t(betahat)%*% t(X) %*% X %*% betahat/101)/2) hist(sigmasample,nclass=100) # then create an empty matrix of betas betasample=matrix(0,nro=10^4,ncol=5) #finally fill in the matrix by sampling form the proper posterior distirbution for (t in 1:10^4) {betasample[t,] = mvrnorm(1,mu=(100/101)*betahat,Sigma=sigmasample[t]*(100/101)*inv(t(X) %*% X)) } # and plot the samples form the posterior par(mfrow=c(2,3)) for (i in 1:5){ hist(betasample[,i],nclass=110,col="grey", prob = TRUE); abline(v=(100/101)*betahat[i],lwd=2,col="gold" ) } hist(sigmasample,n=110,col=" grey" , prob=TRUE,main=expression(sigma^ 2)) # the sampled posterior mean i.e. the bayes estimator is apply(betasample, 2, mean) #that compares well with the true estimator apply(betasample, 2, mean) - (100/101)*betahat