\documentclass[a4paper,12pt]{article}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc} % body text (author affiliations) contains UTF-8 accented characters
\usepackage{amscd,amsmath,amssymb,amsfonts,latexsym,amsthm,mathrsfs}
\usepackage{graphicx,color,fancyhdr,framed} % fancyhdr replaces the obsolete fancyheadings; provides \headrulewidth
\usepackage{rcs}
\usepackage{natbib}
\bibliographystyle{apalike}
% \usepackage[notref,notcite]{showkeys}
%\textwidth 160mm
%\oddsidemargin -5mm
%\evensidemargin -5mm
%\textheight 247mm
%\topmargin -12mm
% \pagestyle{fancy}
\renewcommand{\headrulewidth}{0pt}
% Avoid spurious dots in enumerate environment
\renewcommand\labelenumi{\theenumi}
\newcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\renewcommand{\P}{\mathbb{P}}
\newcommand{\1}[1]{\mathbf{1}_{\left\{ #1\right\}}}
\newcommand{\calF}{{\mathcal F}}
\newcommand{\X}{\Omega} %{{\mathbf X}}
\newcommand{\Klbck}{{\mathcal E}}
\newcommand{\Inv}{{\mathcal I}}
\newcommand{\Rset}{{\mathbb R}}
\newcommand{\note}[1]{\marginpar{\tiny{#1}}}
\newcommand{\aux}[3]{#1^{#2}_{#3}}
\newcommand{\ds}{\displaystyle}
\newcommand{\vsl}{\vspace{1cm}}
\newcommand{\vs}{\vspace{0.5cm}}
\renewcommand{\vss}{\vspace{0.25cm}}
\renewcommand{\ni}{\noindent}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{ex}{Example}
\newtheorem{rem}{Remark}
%\parindent 0cm
\begin{document}%--------------------------------------------------------------------------------------------------
\renewcommand{\baselinestretch}{1}
\huge\normalsize
\title{Convergence of adaptive sampling schemes}
\author{R. Douc\footnote{CMAP, \'Ecole Polytechnique, Palaiseau {\sf douc@cmapx.polytechnique.fr}} \hspace{0.05cm},
A. Guillin\footnote{CEREMADE, Université Paris Dauphine, and TSI, \'Ecole Nationale des Télécom\-munications {\sf guillin@ceremade.dauphine.fr}} \hspace{0.05cm},
J.M. Marin\footnote{Projet SELECT, INRIA FUTURS, Université Paris-Sud and CEREMADE, Université Paris Dauphine {\sf marin@ceremade.dauphine.fr}} \hspace{0.05cm},
and C.P. Robert\footnote{Corresponding author: CEREMADE, Université Paris Dauphine, and CREST, INSEE, Paris {\sf xian@ceremade.dauphine.fr}}}
\maketitle
\begin{abstract} In the design of efficient simulation algorithms, one is often beset with a poor choice of
proposal distributions. Although the performances of a given kernel can clarify how adequate it is for the problem at
hand, a permanent on-line modification of kernels causes concerns about the validity of the resulting algorithm. While
the issue is quite complex and most often intractable for MCMC algorithms, the equivalent version for importance sampling
algorithms can be validated quite precisely. We derive sufficient convergence conditions for a wide class of population
Monte Carlo algorithms and show that Rao--Blackwellized versions asymptotically achieve an optimum in terms of a Kullback
divergence criterion, while more rudimentary versions simply do not benefit from repeated updating.
\vss\noindent{\bf Keywords:}
Adaptivity,
Bayesian Statistics,
Central Limit Theorem,
% glory & fame
importance sampling,
Kullback divergence,
Law of Large Numbers,
MCMC algorithm,
population Monte Carlo,
proposal distribution,
Rao-Black\-wellization.
\end{abstract}%--------------------------------------------------------------------------------------------------
\section{Introduction}
In the simulation settings found in optimization and (Bayesian) integration, it is well-documented \citep{Robert:Casella:2004}
that the choice of the instrumental distributions is paramount for the efficiency of the resulting algorithms. Indeed,
whether we are considering implementing a Metropolis--Hastings algorithm with proposal density $q(x|y)$
or an importance sampling algorithm with importance function $g(x)$, we are relying on a distribution
that is customarily difficult to calibrate, outside a limited range of well-known cases. For instance, a standard
result is that the optimal importance density for approximating an integral
$$
\mathfrak{I} = \int f(x) \pi(x) dx
$$
is $g^\star(x) \propto |f(x)|\,\pi(x)$ \citep[][Theorem 3.12]{Robert:Casella:2004},
but this formal result is not very informative about the practical choice of $g$,
while a poor choice of $g$ may result in an infinite variance estimator. Similarly, it has been established by
\cite{Mengersen:Tweedie:1996} that the choice of the transition kernel $q(x|y)$ in the Metropolis--Hastings
algorithm is crucial for the resulting convergence speed of the Markov chain.
While the goals of simulation experiments
are multifaceted and therefore the efficiency of an algorithm can be evaluated under many different perspectives, a measure of
agreement between the target and the proposal distribution can serve as a proxy in many cases: In the nomenclature designed in
\cite{Andrieu:Robert:2001}, examples of such measures are moments, acceptance rates and autocorrelations.
% More details
An even more robust measure is the Kullback divergence, which is ubiquitous in statistical approximation theory \citep{Csizar:Tusnady:1984}
and which will be used in this paper.
% Adaptive is difficult...
Given the complexity of the original optimization or integration problem (which does require Monte Carlo approximations),
it is rarely the case that the optimization of the proposal distribution against an efficiency measure can be achieved in
closed form. Even the computation of the efficiency measure for a given proposal is impossible in the majority of cases.
For this reason, a number of adaptive schemes have appeared in the recent literature \citep[][Section 7.6.3]{Robert:Casella:2004}, in
order to design better proposals against a given measure of efficiency without resorting to a standard optimization
algorithm. For instance, in the MCMC community, sequential changes in the variance of Markov kernels have been proposed in
\cite{Haario:Sacksman:Tamminen:1999,Haario:Sacksman:Tamminen:2001}, while adaptive changes taking advantage of regeneration
properties of the kernels have been constructed by \cite{Gilks:Roberts:Sahu:1998}
and \cite{Sahu:Zhigljavsky:1998,Sahu:Zhigljavsky:2003}. In a more general perspective,
\cite{Andrieu:Robert:2001} develop a two-level stochastic optimization scheme to update parameters of a proposal towards a
given integrated efficiency criterion like the acceptance rate (or its difference with a value known to be
optimal, \citealp[see][]{Roberts:Gelman:Gilks:1997}). As reflected by this general technical report of \cite{Andrieu:Robert:2001},
the complexity of devising valid adaptive MCMC schemes is
however a genuine drawback in their extension, given that the constraints on the inhomogeneous Markov chain that results from this
adaptive construction either are difficult to satisfy or result in a fixed proposal after a certain number of iterations.
%...but beautiful and dutiful and wonderful...
As stressed in \cite{Cappe:Guillin:Marin:Robert:2004} \citep[see also][Chap. 14]{Robert:Casella:2004}, the importance
sampling perspective is much more amenable to adaptivity than MCMC, due to its unbiased nature: %Crime de lese-Bayesien !
using sampling importance resampling \citep{Rubin:1987,Rubin:1988}, any given sample from an importance distribution $g$
can be transformed into a sample of points marginally distributed from the target distribution $\pi$ and
\cite{Cappe:Guillin:Marin:Robert:2004} showed that this property is also preserved by repeated and adaptive sampling. The asymptotics of
adaptive importance sampling are therefore much more manageable than those of adaptive MCMC algorithms, at least at
a primary level, if only because the algorithm can be stopped at any time since it does not require a burn-in time.
(We will present in this paper more advanced convergence results.)
Borrowing from the sequential sampling literature \citep{Doucet:deFreitas:Gordon:2001}, \cite{Cappe:Guillin:Marin:Robert:2004}
constructed an iterative adaptive scheme christened {\em population Monte Carlo} \citep{Iba:2000} that aims at replicating the adaptivity
of MCMC kernels by a learning mechanism on a population of points, themselves marginally distributed from the target distribution.
% Existin'literature
In this paper, we establish a CLT for a general PMC scheme and derive an iterative adaptive method that converges to the
optimal proposal, the optimality being defined here in terms of Kullback--Leibler divergence.
From a probabilistic point of view, the techniques used in this paper are
related to techniques and results found in \cite{Chopin:2003}, \cite{Kuensch:2003} and \cite{Cappe:Moulines:Ryden:2005}.
In particular, the triangular array technique that is central to the CLT proofs below
can be found in \cite{Cappe:Moulines:Ryden:2005} or \cite{Douc:Moulines:2005}.
The paper is organized as follows:
We first present the algorithmic and mathematical details in Section \ref{sec:pmc1}. % and \ref{sec:pmc2}, respectively.
We evaluate the convergence properties of the basic version of PMC in Section \ref{sec:baseX}, exhibiting its limitations,
and show in Section \ref{sec:RBD} that its Rao-Blackwellized version overcomes these limitations and achieves optimality
for the Kullback--Leibler criterion developed in Section \ref{sec:Kull}. Section \ref{sec:zoli} illustrates the practical
convergence of the method on a few benchmark examples.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Presentation du PMC
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{An adaptive importance sampling scheme: the Population Monte Carlo}\label{sec:pmc2}
\section{Population Monte Carlo}\label{sec:pmc1}
The form of Population Monte Carlo (PMC) introduced in \cite{Cappe:Guillin:Marin:Robert:2004} intrinsically is a form of
iterated sampling importance resampling (SIR), following the device of \cite{Rubin:1987,Rubin:1988}. The idea of using a
repeated form of SIR is that previous samples are informative about the connections between the proposal (importance)
and the target distributions. We stress from the start that there are very few connections with MCMC algorithms in this scheme
since
(a) PMC is not Markovian, being possibly based on the whole sequence of simulation, and
(b) PMC can be stopped at any time, being validated by the basic importance sampling identity
\citep[][equation (3.9)]{Robert:Casella:2004} rather than by a
probabilistic convergence result like the ergodic theorem. These features motivate the use of the method in setups where
off-the-shelf MCMC algorithms cannot be of help. We first recall some basic Monte Carlo techniques to define notations and goals.
\subsection{The Monte Carlo framework}\label{sub:pmc2}
On a measurable space $(\Omega,\mathcal{A})$, we consider a probability distribution $\pi$
on $(\Omega,\mathcal{A})$. We assume that $\pi$ is dominated by
a reference measure $\mu$, $\pi\ll\mu$, and also denote $\ds \pi(dx)=\pi(x)\mu(dx)$ its density. We also
suppose that $\pi$ is known up to a normalizing constant,
$$
\ds \pi(x)=\frac{\ds \tilde \pi(x)}{\ds \int \tilde \pi(x)\mu(dx)}\,,
$$
where $\tilde \pi$ is known, but the calculation of $\ds \int \tilde \pi(x)\mu(dx)<\infty$ is
intractable.
For one or several $\pi$-measurable functions $f$, we are interested in computing an approximation of
$$
\pi(f)=\int f(x)\pi(dx)%=\int f(x)\pi(x)\mu(dx)
=\frac{\ds \int f(x)\tilde \pi(x)\mu(dx)}{\ds \int \tilde \pi(x)\mu(dx)}\,,
$$
assuming that the calculation of $\ds \int f(x)\tilde \pi(x)\mu(dx)$ is also intractable.
In this setting, a standard stochastic approximation method is the Monte Carlo method, %\citep{Rubinstein:1981},
based on an iid sample $x_1,\ldots,x_N$ simulated from $\pi$, that approximates $\pi(f)$ by
$$%\begin{equation}
\hat\pi^{MC}_N(f)=N^{-1}\sum_{i=1}^Nf(x_i)\,,
$$%\end{equation}
which almost surely converges to $\pi(f)$ (as $N$ goes to infinity) by the Law of Large Numbers (LLN).
The Central Limit Theorem (CLT) implies in addition that, if $\pi(f^2)=\int f^2(x)\pi(dx)<\infty$,
$$
\sqrt{N}\left\{ \hat\pi^{MC}_N(f)-\pi(f)\right\} \stackrel{\mathscr{L}}{\leadsto} \mathcal{N}\left( 0,\mathbb{V}_\pi(f) \right)\,,
$$
where $\mathbb{V}_\pi(f)=\pi([f-\pi(f)]^2)$. Obviously, this approach requires a direct iid
simulation from $\pi$ (or $\tilde\pi$) which often is impossible. An alternative
\citep[see, e.g.,][Chap.~3]{Robert:Casella:2004} is to use importance sampling, that is, to pick
a probability distribution $\nu\ll\mu$ on $(\Omega,\mathcal{A})$ called the proposal or importance distribution,
with density also denoted by $\nu$, and to estimate $\pi(f)$ by
$$%\begin{equation}
\hat\pi^{IS}_{\nu,N}(f)=N^{-1}\sum_{i=1}^Nf(x_i)\left(\frac{\pi}{\nu}\right)(x_i).
$$%\end{equation}
If $\pi$ is also dominated by $\nu$, $\pi\ll\nu$,
$\hat\pi^{IS}_{\nu,N}(f)$ almost surely converges to $\pi(f)$. Moreover, if $\nu\left(f^2\left( \pi/\nu \right)^2\right)<\infty$,
the CLT also applies, that is,
$$
\sqrt{N}\left\{ \hat\pi^{IS}_{\nu,N}(f)-\pi(f) \right\} \stackrel{\mathscr{L}}{\leadsto}
\mathcal{N}\left(0,\mathbb{V}_\nu\left(f\frac{\pi}{\nu}\right)\right) \,.
$$
% Useful for us?
% For many $f$, a sufficient condition for $\nu\left(f^2\left(\pi/\nu\right)^2\right)<\infty$ is that ${\pi}/{\nu}$ is bounded from above.
As the normalizing constant of the target distribution $\pi$ is unknown, it is not possible to use directly the IS estimator
$\displaystyle \hat\pi^{IS}_{\nu,N}(f)$ and we need to replace it with the self-normalized version of the IS estimator,
$$%\begin{equation}
\hat\pi^{SNIS}_{\nu,N}(f)=
\left(\sum_{i=1}^N\left({\pi}/{\nu}\right)(x_i)\right)^{-1}\,\sum_{i=1}^Nf(x_i)\left({\pi}/{\nu}\right)(x_i)\,,
$$%\end{equation}
which also converges almost surely to $\pi(f)$. If $\nu\left((1+f^2)\left({\pi}/{\nu}\right)^2\right)<\infty$,
the CLT applies:
$$
\sqrt{N}\,\left\{ \hat\pi^{SNIS}_{\nu,N}(f)-\pi(f) \right\} \stackrel{\mathscr{L}}{\leadsto}
\mathcal{N}\left(0,\mathbb{V}_\nu\left\{[f-\pi(f)]\,\frac{\pi}{\nu}\right\} \right) \,.
$$
Obviously, the quality of the IS and of the SNIS approximations strongly depends on the choice of the proposal distribution $\nu$,
which is delicate for complex distributions like those that occur in high dimensional problems.
\subsection{Sampling Importance Resampling}\label{sub:sis}
The Sampling Importance Resampling (SIR) method \citep{Rubin:1987,Rubin:1988} is an extension of the IS method that achieves
simulation from $\pi$ by resampling rather than by simple reweighting. More precisely,
the SIR algorithm is held in two stages: The first stage is similar to IS and consists in generating
an iid sample $x_1,\ldots,x_N$ from $\nu$. The second stage builds a sample from $\pi$, $\tilde x_1, \ldots, \tilde x_M$,
based on the instrumental sample $x_1, \ldots, x_N$ by resampling. While there are many resampling methods
\citep[][Section 14.3.5]{Robert:Casella:2004}, the most standard (if least efficient) approach is multinomial resampling
in $x_1,\ldots,x_N$ with probabilities proportional to the importance weights
$\left[\frac{\pi}{\nu}(x_1), \ldots, \frac{\pi}{\nu}(x_N)\right]$:
$$
\tilde x_i=x_{J_i}\,,\quad\quad 1\le i\le M\,,
$$
where the random variables $(J_1,\ldots,J_M)$ are iid conditionally on $x_1,\ldots,x_N$
and distributed as
$$
\mathbb{P}\left[J_l=i|x_1,\ldots,x_N\right]=\left(\sum_{j=1}^N\frac{\pi}{\nu}(x_j)\right)^{-1}\frac{\pi}{\nu}(x_i)\,.
$$
The SIR estimator of $\pi(f)$ is then
$$%\begin{equation}
\hat\pi^{SIR}_{\nu,N,M}(f)=M^{-1}\sum_{i=1}^Mf(\tilde x_i)
$$%\end{equation}
which also converges to $\pi(f)$ since each $\tilde x_i$ is (marginally) approximately distributed from $\pi$.
By construction, the variance of $\hat\pi^{SIR}_{\nu,N,M}(f)$ is larger than the variance of the SNIS estimator.
Indeed, the expectation of $\hat\pi^{SIR}_{\nu,N,M}(f)$ conditional on the sample $x_1,\ldots,x_N$ is equal to
$\hat\pi^{SNIS}_{\nu,N}(f)$. Note that an asymptotic analysis
of $\hat\pi^{SIR}_{\nu,N,M}(f)$ is quite delicate because of the dependencies in the SIR sample (which, again, is not an
iid sample from $\pi$).
\subsection{The Population Monte Carlo algorithm}\label{sub:pmc}
In their alternative generalization of Importance Sampling, \cite{Cappe:Guillin:Marin:Robert:2004} introduce an iterative
feature in the production of importance samples, for the purpose of adapting the importance distribution $\nu$
to the target distribution $\pi$. Iterations are indeed necessary to learn about $\pi$ from the (poor or good) performances
of earlier proposals, performances that are for instance evaluated through the distribution of the importance weights.
At iteration $t$ of the PMC algorithm, $N$ realizations are thus
simulated from a proposal distribution that is derived from the $N\times(t-1)$ previous realizations.
\cite{Cappe:Guillin:Marin:Robert:2004} show that the dependence on earlier proposals and realizations
does not jeopardize the fundamental importance sampling identity. Local and adaptive importance sampling schemes
can thus be chosen in a much wider generality than thought previously.
By introducing a temporal dimension to the selection of the importance function, an adaptive perspective
can be achieved at little cost, for a potentially large gain in efficiency.
If we introduce the $\sigma$-algebras related to the current and past simulations,
\begin{align*}
& \mathcal{F}_{N,t}=\sigma\left\{(x_{i,j},J_{i,j})_{1\leq i\leq N,0\leq j\leq t}\right\}\quad (t\geq 0) \,,\\
& \mathcal{F}^J_{N,t}=\mathcal{F}_{N,t}\bigvee \sigma\left\{(x_{i,t+1})_{1\leq i\leq N}\right\}\quad (t\geq 0)\,,
\end{align*}
where both the $x_{i,j}$'s and the $J_{i,j}$'s are defined precisely below,
and if we set the renormalized importance weights as
$$
\bar \omega_{i,t}=\frac{\omega_{i,t}}{\sum_{j=1}^N \omega_{j,t}}\,,
$$
the generic PMC algorithm reads as follows:
\begin{framed}\begin{sf}
\noindent{\bf Generic PMC algorithm:}\\
\noindent At time 0,
\begin{enumerate}\renewcommand{\theenumi}{\alph{enumi})}
\item Generate $(x_{i,0})_{1 \leq i\leq N}$ iid according to $\nu_0$ and compute the importance weights
$\ds \omega_{i,0}=\left\{ {\pi}/{\nu_0} \right\}(x_{i,0})$;
\item Conditionally on $\sigma\left\{(x_{i,0})_{1\leq i\leq N}\right\}$, draw
$$
(J_{i,0})_{1\leq i\leq N} \stackrel{\text{iid}}{\sim}
\mathcal{M}(1,(\bar \omega_{i,0})_{1\leq i\leq N})
$$
and set $\tilde x_{i,0}=x_{J_{i,0},0}$ $(1\le i\le N)$.
\end{enumerate}
At time $t$ ($t=1,\ldots,T$)
\begin{enumerate}\renewcommand{\theenumi}{\alph{enumi})}
\item Conditionally on $\mathcal{F}_{N,t-1}$, draw independently $x_{i,t}$ according to \\
$\nu_{i,t}(\mathcal{F}_{N,t-1})$ and compute the importance weights \\
$\ds \omega_{i,t}=\left\{ {\pi}/{\nu_{i,t}(\mathcal{F}_{N,t-1})} \right\} (x_{i,t})$;
\item Conditionally on $\mathcal{F}^J_{N,t-1}$, draw
$$
(J_{i,t})_{1\leq i\leq N} \stackrel{\text{iid}}{\sim}
\mathcal{M}(1,(\bar \omega_{i,t})_{1\leq i\leq N})
$$
and set $\tilde x_{i,t}=x_{J_{i,t},t}$ $(1\le i\le N)$.
\end{enumerate}\end{sf}\end{framed}
After $T$ iterations of the previous algorithm, a PMC estimator of $\pi(f)$ is given by
\small $$
\hat\pi^{PMC}_{N,T}(f)=\left(\sum_{j=1}^{N}\left\{ \frac{\pi}{\nu_{j,T}(\mathcal{F}_{N,T-1})}\right\} (x_{j,T})\right)^{-1}
\sum_{i=1}^{N}\left\{ \frac{\pi}{\nu_{i,T}(\mathcal{F}_{N,T-1})}\right\} (x_{i,T})f(x_{i,T})\,,
$$
\normalsize although it is more efficient for all estimation purposes to average the PMC approximations over all iterations, possibly with
different weights. Note that we adopt the representation $\nu_{i,t}(\mathcal{F}_{N,t-1})$ for the importance function
to signify that the construction of the proposal distribution for the $i$-th term of the $t$-th sample is completely open,
as illustrated in \cite{Cappe:Guillin:Marin:Robert:2004}. Obviously, not all adaptive schemes lead to an automatic
improvement of the proposal and we now consider two particular schemes where improvement does not occur and does occur,
respectively.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Le D-kernel pas interessant
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The $D$-kernel PMC algorithm}\label{sec:baseX}
In this section, we introduce a particular PMC scheme for which $\nu_{i,t}(\mathcal{F}_{N,t-1})$ is a mixture of
$D$ different transition kernels $Q_k$ $(1\le k\le D)$ that are chosen prior to the simulation experiment, but whose
weights are proportional to their survival rates in the previous resampling step. This scheme was first proposed in
\cite{Cappe:Guillin:Marin:Robert:2004}, with the purpose that, over iterations, the algorithm
would automatically adapt the mixture to the target distribution by converging to the ``right'' weights, in a spirit similar
to the mixture adaptation found in \cite{Andrieu:Robert:2001}. We will however see in this Section that this is not the case, and,
more dramatically, that this scheme is intrinsically non-adaptive.
\subsection{The algorithm}\label{sub:alGod}
We consider a family $(Q_d)_{1 \leq d \leq D}$ of $D$ transition kernels
on $\Omega \times {\mathcal A}$ and we assume that both $\pi$ and $(Q_d(x,\cdot))_{1 \leq d \leq D, \ x \in \X}$ are dominated by the
reference measure $\mu$ introduced earlier. As above, we also set the corresponding density function and transition kernel to be
$\pi$ and $q_d(\cdot,\cdot)$ respectively, that is
$$
\forall A \in {\mathcal A}, \quad \pi(A)=\int_A \pi(x)\mu(dx), \quad Q_d(x, A)=\int_A q_{d}(x,x') \mu(dx') \,.
$$
This situation is rather common in MCMC settings where several vintage transition kernels are often available and difficult to
compare. For instance, the cycle and mixture MCMC schemes already discussed by \cite{Tierney:1994} are of this nature. We detail in
this Section and the following ones how PMC can overcome the difficulty encountered by MCMC algorithms in picking an efficient
mixture of standard kernels $\sum_d \alpha_d Q_d(x,\cdot)$.
The associated PMC algorithm then builds proposals as follows:
\begin{framed}\begin{sf} \noindent {\bf $D$-kernel PMC algorithm:}
\noindent At time 0, use the same step as in the generic PMC algorithm to produce the sample $(\tilde x_{i,0},J_{i,0})_{1\leq i\leq N}$
and set $\alpha^{1,N}_d=1/D $ for all $1\le d \le D$.
\noindent At time $t$ ($t=1,\ldots,T$),
\begin{enumerate}\renewcommand{\theenumi}{\alph{enumi})}
\item\label{stepa} Conditionally on $\sigma\left\{(\alpha^{t,N}_d)_{1 \leq d \leq D}\right\}$, generate
$$
(K_{i,t})_{1 \leq i\leq N} \stackrel{\text{iid}}{\sim} \mathcal{M}(1,(\alpha_d^{t,N})_{1\leq d\leq D})
$$
\item\label{stepb} Conditionally on $\sigma\left\{(\tilde x_{i,t-1},K_{i,t})_{1\leq i\leq N}\right\}$, generate independent
$$(x_{i,t})_{1\leq i\leq N} \sim Q_{K_{i,t}}(\tilde x_{i,t-1},\cdot)$$
and set $\ds \omega_{i,t}={\pi(x_{i,t})}/{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}$;
\item\label{stepc} Conditionally on $\sigma\left\{(\tilde x_{i,t-1},K_{i,t},x_{i,t})_{1\leq i\leq N}\right\}$, generate
$$(J_{i,t})_{1\leq i\leq N}\stackrel{\text{iid}}{\sim} \mathcal{M}(1,(\bar \omega_{i,t})_{1\leq i\leq N})
$$
and set ($1\le i\le N$, $1\leq d\leq D$)
$$
\tilde x_{i,t}=x_{J_{i,t},t}\,,\quad \alpha^{t+1,N}_d=\sum_{i=1}^N\bar \omega_{i,t}\mathbb{I}_d(K_{i,t})
$$
\end{enumerate}
\end{sf}\end{framed}
Recall that $\bar \omega_{i,t}$ denotes the renormalized version of $\omega_{i,t}$.
In words, Step \ref{stepa} picks the kernel index in the mixture for each point in the sample, Step \ref{stepb}
generates the corresponding point and Step \ref{stepc} updates the weights of the $D$ kernels according to their
respective survival performances in the past round. (Since survival is directed by the importance weights, reweighting
is thus related to the respective magnitudes of the importance weights for the different kernels.) Note also that
Step \ref{stepc} is only used to avoid the dissemination of small importance weights along iterations and the
subsequent degeneracy phenomenon that plagues iterated IS schemes like particle filters. Integral approximations should
however use the byproduct of Step \ref{stepb}.
\subsection{Convergence properties}
In order to assess the average effect of these iterations, we now consider the convergence of the algorithm when the
number $N$ of points in each sample is large. Indeed, as already pointed out in \cite{Cappe:Guillin:Marin:Robert:2004},
it does not make much sense to consider the asymptotics of the PMC scheme when $T$ grows large, given that this
algorithm is intended to be run with a small number $T$ of iterations.
In order to prove convergence of the $D$-kernel PMC algorithm,
we first assume that the generalized importance weight is almost surely finite, that is,
\begin{center}
\textbf{(A1)}\hglue 1truecm $\forall d \in \{1, \ldots, D\}$, $\pi\otimes\pi\left\{q_d(x,x')=0\right\}=0$.
\end{center}
Note that assumption \textbf{(A1)} implies that $\pi\otimes\pi\left\{{\pi(x')}\big/{q_d(x,x')}<\infty\right\}=1$.
We denote by $\gamma_u$ the uniform distribution on $\{1,\ldots,D\}$, that is, $\gamma_u(k)=1/D$ for all $k\in\{1,\ldots,D\}$.
We can then deduce a LLN on the pairs $(x_{i,t},K_{i,t})$ produced by the above algorithm:
\begin{prop}\label{lln-dPMC1}
Under \textbf{(A1)}, for any function $h$ in $L^1_{\pi\otimes\gamma_u}$ and for all $t \geq 1$,
$$
\sum_{i=1}^N \bar \omega_{i,t} h(x_{i,t},K_{i,t}) \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \pi\otimes\gamma_u(h).
$$
\end{prop}
\begin{proof}
We proceed by induction wrt $t$. Using Theorem \ref{LGN}, the case $t=1$ is straightforward since this is a direct consequence
of the convergence of the importance sampling algorithm.
Now, let $t>1$ and assume that the LLN holds for $t-1$.
For $h\in L^1_{\pi\otimes\gamma_u}$, to prove that $\sum_{i=1}^N\bar\omega_{i,t} h(x_{i,t},K_{i,t})$ converges in probability
to $\pi\otimes\gamma_u(h)$, we just need to check that
\begin{align*}
& N^{-1}\sum_{i=1}^N \frac{\pi(x_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})} h(x_{i,t},K_{i,t})
\stackrel{N\rightarrow\infty}{\longrightarrow_\P} \pi\otimes\gamma_u(h)\,, \\
& N^{-1}\sum_{i=1}^N\frac{\pi(x_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}
\stackrel{N\rightarrow\infty}{\longrightarrow_\P} 1\,,
\end{align*}
where the second convergence is obviously a special case of the first one (with $h=1$).
For the first convergence, applying Theorem \ref{LGN} with
\begin{align*}
\begin{cases}
U_{N,i}=N^{-1}\frac{\pi(x_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})} h(x_{i,t},K_{i,t}), \\
\mathcal{G}_N=\sigma\left\{(\tilde x_{i,t-1})_{1\leq i\leq N}, (\alpha^{t,N}_d)_{1 \leq d \leq D}\right\},
\end{cases}
\end{align*}
we only need to check condition (iii). For all $C>0$, we have
\begin{align}
\label{eq:inter2}
& N^{-1}\sum_{i=1}^N \E\left[\left.\frac{\pi(x_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})} h(x_{i,t},K_{i,t})
\mathbb{I}_{\left\{\frac{\pi(x_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})} h(x_{i,t},K_{i,t})>C\right\}}\right|\mathcal{G}_N\right]\nonumber \\
& \quad = \sum_{d=1}^D N^{-1}\sum_{i=1}^N F_C(\tilde x_{i,t-1},d)\alpha^{t,N}_d
\end{align}
where $F_C(x,k)=\int \pi(du) h(u,k)\mathbb{I}_{\left\{\frac{\pi(u)}{q_{k}(x,u)}h(u,k)> C\right\}}$.
By induction, we have
\begin{align*}
& \alpha^{t,N}_d=\sum_{i=1}^N\bar \omega_{i,t-1}\mathbb{I}_d(K_{i,t-1})\stackrel{N\rightarrow\infty}{\longrightarrow_\P} 1/D\,, \\
% Here \mu was changed to \pi
& N^{-1}\sum_{i=1}^N F_C(\tilde x_{i,t-1},k) \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \pi(F_C(\cdot,k))\,.
\end{align*}
Using these limits in (\ref{eq:inter2}) yields
$$
N^{-1}\sum_{i=1}^N \E\left[\left.\frac{\pi(x_{i,t})h(x_{i,t},K_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}
\mathbb{I}_{\left\{\frac{\pi(x_{i,t}) h(x_{i,t},K_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}>C\right\}}\right|\mathcal{G}_N\right]
\stackrel{N\rightarrow\infty}{\longrightarrow_\P} \pi\otimes\gamma_u(F_C).
$$
Since $\pi\otimes\gamma_u(F_C)$ converges to 0 as $C$ goes to infinity, this proves that for all $\eta>0$,
$$
N^{-1}\sum_{i=1}^N \E\left(\left. \frac{\pi(x_{i,t})h(x_{i,t},K_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}
\mathbb{I}_{\left\{\frac{\pi(x_{i,t}) h(x_{i,t},K_{i,t})}{q_{K_{i,t}}(\tilde x_{i,t-1},x_{i,t})}>N\eta\right\}}\right|\mathcal{G}_N\right)
\stackrel{N\rightarrow\infty}{\longrightarrow_\P} 0\,.
$$
Condition (iii) is satisfied and Theorem \ref{LGN} applies. The proof follows.
\end{proof}
Note that this convergence result is more than what we need for Monte Carlo purposes since the $K_{i,t}$'s are
auxiliary parameters that are not relevant for the original problem. However, it is
eventually a negative result in that, while it implies that
$$
\sum_{i=1}^N \bar \omega_{i,t} f(x_{i,t})
$$
is a convergent estimator of $\pi(f)$, it also shows that, for $t\ge 1$,
$$
\sum_{i=1}^N \bar \omega_{i,t} {\mathbb I}_{d}(K_{i,t})\stackrel{N\rightarrow\infty}{\longrightarrow_\P} \frac{1}{D}.
$$
Therefore, at {\em each} iteration, the weights of {\em all\/} kernels converge to $1/D$ when the number of points in the sample
grows to infinity. This translates into a lack of learning properties for the $D$-kernel PMC algorithm: its properties at
iteration $1$ and at iteration $10$ are the same. In other words, this algorithm is not adaptive and only requires one iteration
with a large value of $N$. We can note that, when this scheme was used in \cite{Cappe:Guillin:Marin:Robert:2004}, the fast stabilization
of the approximation was noticeable. (It is also possible to establish a CLT for this algorithm, but given its unappealing features,
we leave the details for the interested reader.)
In order to get a truly adaptive PMC scheme, based on the above $D$-kernel algorithm, we have first to set an effective criterion of
adaptivity and approximation of the target distribution by the proposal distribution.
We then derive a modification of the original $D$-kernel algorithm that
achieves efficiency in this sense. As argued in many papers using a wide range of arguments, a natural choice of approximation metric
is the Kullback divergence: we can aim at deriving the $D$-kernel mixture that minimizes the Kullback divergence between this mixture
and the target measure $\pi$
\begin{equation}\label{fistKul}
\iint \log\left( \frac{\pi(x) \pi(x')}{\pi(x) \sum_{d=1}^D \alpha_d q_d(x,x')} \right) (\pi \otimes \pi) (dx,dx')\,.
\end{equation}
The following Section is devoted to the problem of finding an iterative choice of mixing coefficients that
converges to this minimum. The optimal PMC scheme then follows in Section \ref{sec:RBD}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Critere de divergence et atteinte du max!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Kullback divergence}\label{sec:Kull}
\subsection{The criterion}
Using the same notation as above, in conjunction with the choice of the weights $\alpha_d$ in the $D$-kernel
mixture, we introduce the simplex of $\mathbb{R}^D$,
$$
\mathscr{S}=\left\{ \alpha=(\alpha_{1}, \ldots, \alpha_{D});\ \forall d \in \{1, \ldots, D\},\
\alpha_{d}\geq 0 \quad \mbox{ and } \quad \sum_{d=1}^D \alpha_d=1\right\}\,
$$
and $\bar \pi= \pi \otimes \pi$.
We then assume that the family of the $D$ kernels satisfies the condition
\begin{center}
({\textbf {A2}})
$\forall i \in \{1, \ldots, D\}$, $\E_{\bar \pi}\left[|\log{q_i(X,X')}|\right]=\iint |\log q_i(x,x')| \bar \pi(dx,dx')< \infty,$
\end{center}
which is automatically satisfied when all $q_j$'s dominate $\pi$ (in the accept-reject sense that $\pi/q_j$ is bounded).
We then derive from the Kullback divergence a function on $\mathscr{S}$, that is, for $\alpha \in \mathscr{S}$,
$$
\Klbck_{\bar \pi}(\alpha)= \iint \bar \pi(dx,dx') \log \left(\sum_{d=1}^D \alpha_d q_d(x,x')\right)
= \E_{\bar \pi}\left[\log \sum_{d=1}^D \alpha_d q_d(X,X')\right]\,.
$$
Note that, due to the strict concavity of the $\log$ function, $\Klbck_{\bar \pi}$
is a strictly concave function on a connected compact set and thus has no local maximum besides the global maximum, denoted
$$
\alpha^{max}= \arg\max_{\alpha \in \mathscr{S}}\,\Klbck_{\bar \pi}(\alpha)\,.
$$
Note also that, since
$$
\int \pi(dx) \log \pi(x)-\Klbck_{\bar \pi}(\alpha)= \E_{\bar \pi}\left(\log
\frac{\pi(X)\pi(X')}{\pi(X)\left\{ \sum_{d=1}^D\alpha_d q_d(X,X')\right\}}\right)\,,
$$
%which is thus the Kullback Leibler divergence between $ \sum_{d=1}^D \alpha_d \pi(dx) Q_d(x,dx')$ and $\bar \pi(dx,dx')$. Thus
$\alpha^{max}$ is the optimal choice for a mixture of transition kernels such that the joint law of $(X_0,X_1)$ when $X_0 \sim \pi$
is the nearest to the product distribution $\bar \pi=\pi \otimes \pi$. We then have the following obvious inequality:
\begin{lemma}
Under \textbf{(A1-A2)}, for all $\alpha \in \mathscr{S}$, $\Klbck_{\bar \pi}(\alpha) \leq \int \pi(dx) \log \pi(x)$.
\end{lemma}
\subsection{A maximization algorithm}
We now propose an iterative procedure, akin to the EM algorithm, that updates the weights so that the function
$\Klbck_{\bar \pi}(\alpha)$ increases at each step.
We first define $F$ as the function on $\mathscr{S}$ such that %: S \to S$ be the function defined by
$$%\begin{equation}
F(\alpha) =\left(\E_{\bar \pi}\left[ \frac{\alpha_d q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right]\right)_{1 \leq d \leq D}
$$%\end{equation}
and construct the sequence on $\mathscr{S}$
\begin{equation}\label{eq:miseJourCoeff}
\begin{cases}
\alpha^{1}=(1/D, \ldots, 1/D) & \\
\alpha^{t+1}=F(\alpha^{t})&\mbox{for}\quad t \geq 1
\end{cases}
\end{equation}
Note that, under assumption \textbf{(A1)}, for all $t \geq 0$, \\
$\E_{\bar \pi}\left( {q_d(X,X')}\bigg/{\sum_{j=1}^D \alpha_j^t q_j(X,X')}\right)>0$ and thus, for all $t \geq 0$ and all $ d \in \{1, \ldots, D \}$,
$\alpha^t_{d}>0$. If we define the extremal set $\Inv_D$ as
\small \begin{equation}\label{def:invariant}
\left \{\alpha \in \mathscr{S}; \forall d \in \{1, \ldots, D\},\ \alpha_d=0 \quad \mbox{or}
\quad \E_{\bar \pi}\left( \frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)=1\right\} \,,
\end{equation}
\normalsize we then have the following fixed point result:
\begin{prop}
\label{prop:decreasingContrast}
Under \textbf{(A1)} and \textbf{(A2)},
\begin{description}
\item [i)] $\Klbck_{\bar \pi} \circ F -\Klbck_{\bar \pi}$ is continuous,
\item [ii)] For all $\alpha \in \mathscr{S}$, $ \Klbck_{\bar \pi} \circ F(\alpha) \geq \Klbck_{\bar \pi}(\alpha)$,
\item [iii)] $\Inv_D=\{\alpha \in \mathscr{S}; F(\alpha)=\alpha\}=\{\alpha\in \mathscr{S}; \Klbck_{\bar \pi} \circ F(\alpha) =\Klbck_{\bar \pi}(\alpha)\}$ and $\Inv_D$ is finite.
\end{description}
\end{prop}
\begin{proof}
$\Klbck_{\bar \pi}$ is clearly continuous. Moreover, by Lebesgue dominated convergence theorem,
the function $\alpha \mapsto \E_{\bar \pi}\left({\alpha_d q_d(X,X')}\big/{\sum_{j=1}^d \alpha_j q_j(X,X')}\right)$ is also continuous,
which implies that $F$ is continuous. This completes the proof of {\bf i)}.
Now, by the concavity of the $\log$ function,
\begin{align}
&\Klbck_{\bar \pi}(F(\alpha)) - \Klbck_{\bar \pi}(\alpha) \nonumber\\
&\quad =\E_{\bar \pi} \left( \log\left[\sum_{d=1}^D \frac{\alpha_d q_d(X,X')}{\sum_{j=1}^D
\alpha_j q_j(X,X')}\E_{\bar \pi}\left( \frac{q_d(X,X')}{\sum_{j=1}^D
\alpha_j q_j(X,X')}\right)\right]\right) \nonumber \\
&\quad \geq \E_{\bar \pi}\left[ \sum_{d=1}^D \frac{\alpha_d q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\log \E_{\bar \pi}\left( \frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)\right] \nonumber \\
& \quad = \sum_{d=1}^D \alpha_d \E_{\bar \pi}\left(\frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right) \log \E_{\bar \pi}\left(\frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right) \label{eq:contrast1}
\end{align}
Applying the inequality $u\log u \geq u -1$ to (\ref{eq:contrast1})
yields {\bf ii)}. Moreover, equality in $u\log u \geq u -1$ holds if,
and only if $u=1$. Therefore, equality in (\ref{eq:contrast1}) is equivalent to
$$
\forall \alpha_d \neq 0, \quad \E_{\bar \pi}\left(\frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)=1.
$$
Thus, $\Inv_D=\{\alpha\in \mathscr{S}; \Klbck_{\bar \pi} \circ
F(\alpha)=\Klbck_{\bar \pi}(\alpha)\}$. The second equality $\Inv_D =\{\alpha \in \mathscr{S}; F(\alpha)=\alpha\}$
is straightforward.
We now prove by induction on $D$ that $\Inv_D$ is finite. Recalling the definition of
$\Inv_D$, % (see (\ref{def:invariant})),
the recursion is quite straightforward. We just need to prove that the set
$$
\{\alpha \in \Inv_D;\alpha_d \neq 0 \quad \forall d \in \{1, \ldots, D\}\}
$$
is empty or finite. If this set is non-empty, any element $\alpha$ in this set satisfies
$$
\forall d \in \{1, \ldots, D\} \quad \E_{\bar \pi}\left(\frac{q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)=1\,,
$$
which implies
\begin{multline*}
0=\sum_{d=1}^D \alpha^{max}_d \left(\E_{\bar \pi} \left(\frac{ q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)-1\right) \\
=\E_{\bar \pi} \left(\frac{\sum_{d=1}^D \alpha^{max}_d q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}-1\right)\geq \E_{\bar \pi} \left( \log \frac{\sum_{d=1}^D \alpha^{max}_d q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)\geq 0
\end{multline*}
By uniqueness of the global maximum of $\Klbck_{\bar \pi}$, we conclude that $\alpha=\alpha^{max}$ and hence {\bf iii)}.
\end{proof}
%\begin{rem}
Proposition \ref{prop:decreasingContrast} implies that our recursive procedure satisfies
$\Klbck_{\bar \pi}(\alpha^{t+1}) \geq \Klbck_{\bar \pi}(\alpha^{t})$. Therefore, the Kullback Leibler divergence
criterion \eqref{fistKul} decreases at each step.
This property is closely linked with the EM algorithm \citep[][Section 5.3]{Robert:Casella:2004}. More precisely, consider the mixture model
$$
V \sim {\mathcal M}(1,(\alpha_1, \ldots, \alpha_D))
\quad\text{and}\quad
W=(X,X')|V \sim \pi(dx) Q_V(x,dx')
$$
with parameter $\alpha$. We denote by $\bar \E_{\alpha}$ the corresponding expectation, by $p_\alpha (v,w)$
the joint density of $(V,W)$ wrt $\mu \otimes \mu$ and by $p_\alpha (w)$ the density of $W$ wrt $\mu$.
Then it is easy to check that $\Klbck_{\bar \pi}(\alpha)=\int\log(p_\alpha(w))\bar\pi(dw)$
which is an average version of the criterion to be maximized in the EM algorithm when only $W$ is observed.
In that case, a natural idea adapted from the EM algorithm would be to update $\alpha$ according to the iterative scheme
$$%\begin{align*}
\alpha^{t+1}= \arg \max_{\alpha \in \mathscr{S}} \int\bar\E_{\alpha^t}\left[ \left. \log p_\alpha(V,w) \right|w \right]\bar\pi(dw)\,.
$$%\end{align*}
By direct algebra, this definition of $\alpha^{t+1}$ is equivalent to the update formula $\alpha^{t+1}=F(\alpha^t)$ that we used
above. Our algorithm then appears as an averaged EM, but preserves the deterministic increase of the criterion enjoyed by EM.
%\end{rem}
The following proposition ensures that any $\alpha$ different from $\alpha^{max}$ %$\mathscr{S} \setminus \{\alpha^{max}\}$ is repulsive.
is repulsive. % Bouh, pas beau !
\begin{prop}
\label{prop:repulsive}
Under \textbf{(A1)} and \textbf{(A2)}, for every $\alpha \in \mathscr{S}\setminus \{\alpha^{max}\}$, there exists a neighborhood $V_{\alpha}$
of $\alpha$ such that, if $\alpha^{t_0} \in V_\alpha$, then $(\alpha^t)_{t \geq t_0}$ leaves $V_{\alpha}$ within a finite time.
\end{prop}
\begin{proof}
Let $\alpha \in \mathscr{S}\setminus \{\alpha^{max}\}$. Then, using the inequality $u-1 \geq \log u$,
$$
\sum_{d=1}^D \alpha^{max}_d \E_{\bar \pi} \left(\frac{ q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)-1 \geq \E_{\bar \pi} \left( \log \frac{\sum_{d=1}^D \alpha^{max}_d q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)>0
$$
which implies that there exists $d \in \{1, \ldots, D\}$ such that
$$
\E_{\bar \pi} \left(\frac{ q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)>1.
$$
Let $(W_n)_{n \geq 0}$ be a non increasing sequence of neighborhoods of $\alpha$ in $\mathscr{S}$. We have by
the monotone convergence theorem,
\begin{eqnarray*}
1<\E_{\bar \pi}\left(\frac{ q_d(X,X')}{\sum_{j=1}^D \alpha_j q_j(X,X')}\right)&=& \E_{\bar \pi}\left(\lim_{n \to \infty}\inf_{\beta \in W_n} \frac{ q_d(X,X')}{\sum_{j=1}^D \beta_j q_j(X,X')} \right)\\
&=& \lim_{n \to \infty} \E_{\bar \pi}\left(\inf_{\beta \in W_n} \frac{ q_d(X,X')}{\sum_{j=1}^D \beta_j q_j(X,X')} \right)\\
&\leq&\lim_{n \to \infty}\inf_{\beta \in W_n}\E_{\bar \pi}\left( \frac{ q_d(X,X')}{\sum_{j=1}^D \beta_j q_j(X,X')} \right)
\end{eqnarray*}
Thus, there exist a neighborhood $W_{n_0}=V_\alpha$ of $\alpha$ and
$\eta>1$ such that for all $\beta \in V_\alpha$,
\begin{equation}\label{eq:sup1}
\E_{\bar \pi} \left(\frac{ q_d(X,X')}{\sum_{j=1}^D \beta_j q_j(X,X')}\right)>\eta.
\end{equation}
Now use that for all $t\geq 0$ and $d \in \{1, \ldots, D\}$, $1\geq \alpha^t_d>0$ and combine (\ref{eq:sup1}) with the
update formulas for $\alpha^t_d$ (given by (\ref{eq:miseJourCoeff})). This shows that $(\alpha^t)_{t \geq 0}$
leaves $V_\alpha$ within a finite time.
\end{proof}
We thus conclude that the maximization algorithm is convergent:
\begin{prop} \label{prop:convMax}
Under \textbf{(A1)} and \textbf{(A2)},
$$
\lim_{t \to \infty} \alpha^t=\alpha^{max}\,.
$$
\end{prop}
\begin{proof}
First, note that $\Inv_D$ is a finite set which contains $\alpha^{max}$. Write $\Inv_D=\{\beta_0,\beta_1, \ldots, \beta_I\}$
with $\beta_0=\alpha^{max}$. If we introduce a sequence $(W_i)_{0\leq i \leq I}$ of disjoint neighborhoods of the $\beta_i$'s so that
for all $0\le i\le I$, $F(W_i)$ is disjoint from $\cup_{j \neq i} W_j$ (this is possible since $F(\beta_i)=\beta_i$
and $F$ is continuous) and, for all $i \in \{1,\ldots, I\}$, $W_i \subset V_{\beta_i}$ where the $(V_{\beta_i})$'s
are defined in the proof of Proposition \ref{prop:repulsive}.
The sequence $(\Klbck_{\bar \pi}(\alpha^t))_{t \geq 0}$ is upper-bounded and non decreasing and therefore it converges.
This implies that $\lim_{t \to \infty}\Klbck_{\bar \pi}\circ F(\alpha^t)-\Klbck_{\bar \pi}(\alpha^t)=0$.
By continuity of $\Klbck_{\bar \pi}\circ F-\Klbck_{\bar \pi}$, there exists $T>0$ such that for all $t \geq T$,
$\alpha_t \in \cup_{j} W_j$. Since $F(W_i)$ is disjoint from $\cup_{j \neq i} W_j$, this implies that there exists
$i \in \{0, \ldots, I\}$ such that for all $t \geq T$, $\alpha^t \in W_i$. By Proposition \ref{prop:repulsive}, $i$
cannot be in $\{1,\ldots, I\}$. Thus, for all $t \geq T$, $\alpha^t \in W_0$ which is a neighborhood of
$\beta_0=\alpha^{max}$. The proof is completed.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%Rao-Blackwellized D-kernel PMC
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{The Rao-Blackwellized $D$-kernel PMC}\label{sec:RBD}
The update formula (\ref{eq:miseJourCoeff}) has been shown to improve the Kullback Leibler divergence criterion
at every iteration. We now discuss how to implement this mechanism within a PMC algorithm that resembles the previous
$D$-kernel algorithm. The only difference with the algorithm of Section \ref{sub:alGod} is that we make use of the kernel structure in the
computation of the importance weight: in MCMC terminology, this is called ``Rao-Blackwellization''
\citep[][Section 4.2]{Robert:Casella:2004} and it is known to provide variance reduction in data augmentation settings
\citep[][Section 9.2]{Robert:Casella:2004}. In the current context, the improvement brought by Rao-Blackwellization
is dramatic, in that the modified algorithm does converge to the proposal mixture that is closest to the target distribution in the sense of
the Kullback Leibler divergence. More precisely, a Monte Carlo version of the update formula \eqref{eq:miseJourCoeff} can be implemented
in the iterative definition of the mixture weights, in the same way as MCEM approximates EM \citep[][Section 5.3.3]{Robert:Casella:2004}.
\subsection{The algorithm}\label{sub:alGore}
In importance sampling as well as in MCMC settings, the conditioning improvement brought by Rao-Blackwellization may be significant \citep{Celeux:Marin:Robert:2003}. In the
context of the $D$-kernel PMC scheme, the Rao-Blackwellization argument is that it is not necessary to use the mixture component in the
computation of the importance weight but rather the whole mixture. The importance weight is therefore
$$
{\pi(x_{i,t})}\bigg/ {\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1},x_{i,t})}
\quad\text{rather than}\quad
{\pi(x_{i,t})}\big/ q_{K_{i,t}} (\tilde x_{i,t-1},x_{i,t})
$$
as in the algorithm of Section \ref{sub:alGod}. As already noted by \cite{Hesterberg:1998}, the use of the whole mixture in the
importance weight provides a robust tool for preventing infinite variance importance sampling estimators. In our setup, this choice
of weight will guarantee that the following algorithm converges to the optimal mixture.
\begin{framed}\begin{sf}
\noindent {\bf Rao-Blackwellized $D$-kernel PMC algorithm:}
\noindent At time 0, use the same step as in the generic PMC algorithm to produce the sample $(\tilde x_{i,0},J_{i,0})_{1\leq i\leq N}$
and set $\alpha^{1,N}_d=1/D $ for all $1\le d \le D$.
\noindent At time $t$ ($t=1,\ldots,T$),
\begin{enumerate}\renewcommand{\theenumi}{\alph{enumi})}
\item Conditionally on $\sigma\left\{ (\alpha^{t,N}_d)_{1 \leq d \leq D}\right\}$, generate
$$
(K_{i,t})_{1 \leq i\leq N} \stackrel{\text{iid}}{\sim} \mathcal{M}(1,(\alpha_d^{t,N})_{1\leq d\leq D})
$$
\item Conditionally on $\sigma\left\{(\tilde x_{i,t-1},K_{i,t})_{1\leq i\leq N}\right\}$, generate independent
$$(x_{i,t})_{1\leq i\leq N} \sim Q_{K_{i,t}}(\tilde x_{i,t-1},\cdot)$$
and set $\ds \omega_{i,t}={\pi(x_{i,t})}\bigg/{\sum_{d=1}^D\alpha^{t,N}_d q_d(\tilde x_{i,t-1},x_{i,t})}$;
\item\label{stepC} Conditionally on $\sigma\left\{(\tilde x_{i,t-1},K_{i,t},x_{i,t})_{1\leq i\leq N}\right\}$, generate
$$(J_{i,t})_{1\leq i\leq N}\stackrel{\text{iid}}{\sim} \mathcal{M}(1,(\bar \omega_{i,t})_{1\leq i\leq N})
$$
and set ($1\le i\le N$, $1\leq d\leq D$)
$$
\tilde x_{i,t}=x_{J_{i,t},t}\,,\quad \alpha^{t+1,N}_d=\sum_{i=1}^N\bar \omega_{i,t}\mathbb{I}_d(K_{i,t})
$$
\end{enumerate}
\end{sf}\end{framed}
Note that, once more, the adaptive mechanism is based on the importance weights.
The update of the $\alpha_d$'s in Step \ref{stepC} is the Monte Carlo version of \eqref{eq:miseJourCoeff}
and we now show that this algorithm is converging.
\subsection{The LLN for the Rao-Blackwellized $D$-kernel PMC algorithm}\label{sub:lalaine}
Not very surprisingly, the population of points obtained at each iteration of the Rao-Blackwellized algorithm above
approximates the target distribution in the sense of the weak Law of Large Numbers (LLN). Note that the convergence holds
under the very weak assumption (\textbf{A1}) and for any test function $h$ that is absolutely integrable wrt the target distribution
$\pi$. The function $h$ may thus be unbounded.
\begin{theorem} \label{theo:Lgn}
Under \textbf{(A1)}, for any function $h$ in $L^1_\pi$ and for all $t\geq 0$,
\begin{align}
& \sum_{i=1}^N \bar \omega_{i,t} h(x_{i,t})\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\pi(h) \label{eq:lgn2} \\
& \frac{1}{N} \sum_{i=1}^N h(\tilde x_{i,t})\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\pi(h) \label{eq:lgn1}
\end{align}
\end{theorem}
\begin{proof}
We proceed by induction wrt $t$ on the two limiting results (\ref{eq:lgn1}) and (\ref{eq:lgn2}).
The case $t=0$ is the basic importance sampling
convergence result. Now, let $t \geq 1$ and assume that (\ref{eq:lgn1}) and (\ref{eq:lgn2}) both
hold for $t-1$. We will just show (\ref{eq:lgn2})
since (\ref{eq:lgn1}) is a straightforward consequence of (\ref{eq:lgn2}) and Theorem \ref{LGN} due
to multinomial sampling, by noting that
\begin{equation*}
\E \left(\left. \frac{1}{N} \sum_{i=1}^N h(\tilde x_{i,t}) \right| (x_{i,t})_{1 \leq i\leq N} \right)
=\sum_{i=1}^N \bar \omega_{i,t} h(x_{i,t}).
\end{equation*}
To prove (\ref{eq:lgn2}), we will check that
\begin{align*}
&\frac{1}{N}\sum_{i=1}^N \omega_{i,t} h(x_{i,t})\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\pi(h)\\
&\frac{1}{N}\sum_{i=1}^N \omega_{i,t}\stackrel{N\rightarrow\infty}{\longrightarrow_\P}1.
\end{align*}
The latter limit is also a direct consequence of the former with $h=1$. We apply Theorem \ref{LGN} with
${\mathcal G}_N=\sigma\left((\tilde x_{i,t-1})_{1 \leq i \leq N}, (\alpha^{t,N}_d)_{1 \leq d \leq D}\right)$
and $U_{N,i}=N^{-1}\omega_{i,t}h(x_{i,t})$.
Conditionally on ${\mathcal G}_N$, the $(x_{i,t})_{1 \leq i\leq N}$'s are independent and
$$
x_{i,t} |{\mathcal G}_N\sim \sum_{d=1}^D \alpha^{t,N}_d Q_d(\tilde x_{i,t-1}, \cdot)
$$
Noting that
$$
\sum_{i=1}^N \E\left(\left. \frac{\omega_{i,t} h(x_{i,t})}{N}\right| {\mathcal G}_N \right)
= \sum_{i=1}^N \E\left(\left. \frac{\pi(x_{i,t}) h(x_{i,t})}{N \sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1},
x_{i,t})}\right| {\mathcal G}_N \right)=\pi(h),
$$
to apply Theorem \ref{LGN}, we only need to check condition (iii). Now, write
\begin{align*}
& \sum_{i=1}^N \E\left(\left. \frac{\omega_{i,t} h(x_{i,t})}{N} \mathbb{I}_{\left\{ \omega_{i,t} h(x_{i,t})>C\right\}}\right| {\mathcal G}_N \right) \\
& \quad =\frac{1}{N} \sum_{i=1}^N \int \pi(dx)h(x) \mathbb{I}_{\left\{ \frac{\pi(x)h(x)}{\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1}, x)}>C\right\}} \\
& \quad \leq \sum_{d=1}^D \frac{1}{N} \sum_{i=1}^N \int \pi(dx)h(x) \mathbb{I}_{\left\{ \frac{\pi(x)h(x)}{D^{-1} q_d(\tilde x_{i,t-1}, x)}>C\right\}}
\end{align*}
Note $F_C(u)=\int \pi(dx)h(x) \mathbb{I}_{\left\{ \frac{\pi(x)h(x)}{D^{-1} q_d(u, x)}>C\right\}}$. We have $F_C(u) \leq \pi(h)$ and thus, by the induction assumption
$$
N^{-1} \sum_{i=1}^N F_C(\tilde x_{i,t-1}) \stackrel{N\rightarrow\infty}{\longrightarrow_\P}\pi(F_C).
$$
The proof is completed since
$$
\pi(F_C)=\iint \pi(dx)\pi(dx') h(x)\mathbb{I}_{\left\{ \frac{\pi(x)h(x)}{D^{-1} q_d(x', x)}>C\right\}}
\stackrel{C \rightarrow \infty}{\longrightarrow} 0\,.
$$
\end{proof}
\subsection{Convergence of the weights}\label{sub:Return_o_Jedi}
The next proposition ensures that, at each iteration of the algorithm, the population of points is
modified according to a mixture of kernels whose weights approximate the ones obtained by the
iterative procedure described in Section \ref{sec:Kull} for minimizing the Kullback divergence criterion.
\begin{prop}\label{trick}
Under \textbf{(A1)}, for all $\ t \geq 1$,
\begin{equation}\label{eq:coeffConv}
\forall 1\le d\le D, \quad \alpha^{t,N}_d\stackrel{N\rightarrow\infty}{\longrightarrow_\P} \alpha^t_d
\end{equation}
where the $\alpha^t_d$'s are defined in (\ref{eq:miseJourCoeff}).
\end{prop}
Combining Proposition \ref{trick} with Proposition \ref{prop:convMax}, we obtain that,
under assumptions \textbf{(A1)} and \textbf{(A2)}, the
Rao--Blackwellized version of the PMC algorithm automatically adapts the weights of the proposed mixture of kernels and converges to
the optimal combination of mixtures wrt to the Kullback divergence criterion defined in Section \ref{sec:Kull}.
\begin{proof}
The case $t=1$ is obvious. Now, assume \eqref{eq:coeffConv} holds
for some $t\geq1$. As in the proof of Proposition \ref{lln-dPMC1}, we now prove that
\begin{align*}
&\frac{1}{N} \sum_{i=1}^N \omega_{i,t} \mathbb{I}_d({ K_{i,t}})=\frac{1}{N}\sum_{i=1}^N\frac{\pi(x_{i,t})}{\sum_{l=1}^D \alpha^{t,N}_lq_l(\tilde x_{i,t-1},x_{i,t})}\mathbb{I}_d({ K_{i,t}})
\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\alpha^{t+1}_d,\\
&\frac{1}{N} \sum_{i=1}^N \omega_{i,t}\stackrel{N\rightarrow\infty}{\longrightarrow_\P}1.
\end{align*}
Only the first convergence needs to be considered since the latter can be easily deduced from the former. \\
We apply Theorem \ref{LGN} with ${\mathcal G}_N=\sigma \left((\tilde x_{i,t-1})_{1 \leq i \leq N},
(\alpha^{t,N}_d)_{1 \leq d \leq D}\right)$
and $U_{N,i}=N^{-1}\omega_{i,t}\mathbb{I}_d({ K_{i,t}})$.
Conditionally on ${\mathcal G}_N$, $(K_{i,t},x_{i,t})_{1 \leq i \leq N}$ are independent and for
all $(d,A)$ in $\{1,\ldots, D\} \times {\mathcal A}$,
$$
\P\left(\left. K_{i,t}=d,x_{i,t} \in A \right| {\mathcal G}_N \right)= \alpha^{t,N}_d Q_d(\tilde x_{i,t-1},A)
$$
To apply Theorem \ref{LGN}, we just need to check condition (iii). We have
\begin{align*}
&\E\left(\left. \sum_{i=1}^N \frac{\omega_{i,t} {\mathbb I}_d(K_{i,t})}{N} {\mathbb I}_{\left\{\omega_{i,t} {\mathbb I}_d(K_{i,t})>C\right\}}\right| {\mathcal G}_N\right)\\
& \quad \leq \sum_{j=1}^D \frac{1}{N} \sum_{i=1}^N \int \pi(dx) \frac{\alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t,N}_l q_l(\tilde x_{i,t-1},x)} {\mathbb I}_{\left\{ \frac{\pi(x)}{D^{-1} q_j(\tilde x_{i,t-1},x)}>C\right\}}\\
& \quad \leq \sum_{j=1}^D \frac{1}{N} \sum_{i=1}^N \int \pi(dx) {\mathbb I}_{\left\{ \frac{\pi(x)}{D^{-1} q_j(\tilde x_{i,t-1},x)}>C\right\}} \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \sum_{j=1}^D \bar \pi \left( \frac{\pi(x)}{D^{-1} q_j( x',x)}>C\right)
\end{align*}
by the LLN stated in Theorem \ref{theo:Lgn}. The rhs converges to 0 as $C$ grows to infinity since by assumption
\textbf{(A1)}, $\bar \pi \{ q_j(x,x')=0\}=0$. Thus, Theorem \ref{LGN} applies and
$$
\frac{1}{N} \sum_{i=1}^N \omega_{i,t} \mathbb{I}_d({ K_{i,t}})- \E \left( \left. \frac{1}{N} \sum_{i=1}^N \omega_{i,t} \mathbb{I}_d({ K_{i,t}})\right| {\mathcal G}_N\right)\stackrel{N\rightarrow\infty}{\longrightarrow_\P} 0.
$$
To complete the proof, it remains to show that
\small \begin{equation} \label{eq:auxil}
\E \left( \left. \frac{1}{N} \sum_{i=1}^N \omega_{i,t} \mathbb{I}_d({ K_{i,t}})\right| {\mathcal G}_N\right)=\frac{1}{N} \sum_{i=1}^N \int \pi(dx) \frac{\alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t,N}_l q_l(\tilde x_{i,t-1},x)} \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \alpha^{t+1}_d
\end{equation}
\normalsize Using again the LLN stated in Theorem \ref{theo:Lgn},
\begin{equation}
\label{eq:auxil2}
\frac{1}{N} \sum_{i=1}^N \int \pi(dx) \frac{\alpha^{t}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t}_l q_l(\tilde x_{i,t-1},x)} \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \E_{\bar \pi}\left(\frac{\alpha^t_d q_d(X,X')}{\sum_{l=1}^D \alpha^{t}_l q_l(X,X')} \right)=\alpha^{t+1}_d
\end{equation}
Thus, to prove (\ref{eq:auxil}), we use (\ref{eq:auxil2}) and check that the difference between both terms converges to $0$
in probability. To see this, first note that for all $t \geq 1$, for all $d$ in $\{1, \ldots, D\}$, $\alpha^t_d>0$ and thus,
by the induction assumption, for all $d$ in $\{1, \ldots, D\}$,
$\frac{\alpha^{t,N}_d-\alpha^t_{d}}{\alpha^t_d} \stackrel{N\rightarrow\infty}{\longrightarrow_\P} 0$. Using that $\left| \frac{A}{B} -\frac{C}{D}\right| \leq \left|\frac{A}{B}\right| \left|\frac{D-B}{D}\right| +\left|\frac{A-C}{C}\right|\left|\frac{C}{D}\right|$,
we have by straightforward algebra,
\begin{align*}
& \left| \frac{\alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t,N}_l q_l(\tilde x_{i,t-1},x)}-\frac{\alpha^{t}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t}_l q_l(\tilde x_{i,t-1},x)}\right|\\
&\quad \leq \frac{\alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}{\sum_{j=1}^D \alpha^{t,N}_l q_j(\tilde x_{i,t-1},x)} \left(\sup_{l \in \{1, \ldots, D\}}\left|\frac{\alpha^{t,N}_l-\alpha^t_{l}}{\alpha^t_l}\right|\right) \\
& \quad \hspace{5cm}+ \left|\frac{\alpha^{t,N}_d-\alpha^t_{d}}{\alpha^t_d}\right|\frac{\alpha^{t}_d q_d(\tilde x_{i,t-1},x)}{\sum_{l=1}^D \alpha^{t}_l q_l(\tilde x_{i,t-1},x)}\\
&\quad \leq 2 \sup_{l \in \{1, \ldots, D\}}\left|\frac{\alpha^{t,N}_l-\alpha^t_{l}}{\alpha^t_l}\right| \,.
\end{align*}
The proof follows from $\frac{\alpha^{t,N}_d-\alpha^t_{d}}{\alpha^t_d} \stackrel{N\rightarrow\infty}{\longrightarrow_\P} 0$.
\end{proof}
\subsection{The CLT for the Rao-Blackwellized $D$-kernel PMC algorithm}\label{sub:celai}
We now state and prove a CLT for the weighted and the unweighted samples when the size of
the population grows to infinity. As noted for the SIR algorithm (see Section \ref{sub:sis}),
the asymptotic variance associated with the unweighted sample is larger than the variance of the weighted sample.
\begin{theorem}\label{theo:CLT}
Under \textbf{(A1)},
\begin{description}
\item [i) ] For all $h$ satisfying $\bar \pi\left( h^2(x') \frac{\pi(x)}{ q_d(x,x')}\right) < \infty$ for some $d \in \{1, \ldots, D\}$,
we have
\begin{equation}\label{eq:CLTpoids}
\sqrt{N}\sum_{i=1}^N \bar \omega_{i,t}\left\{ h(x_{i,t})-\pi(h) \right\} \stackrel{\cal L}{\longrightarrow}{\cal N}(0,\sigma_t^2),
\end{equation}
with $\sigma^2_t=\bar \pi\left((h(x')-\pi(h))^2\frac{\pi(x')}{\sum_{d=1}^D\alpha^t_dq_d(x,x')}\right)$.
\item [ii) ] If moreover $\pi(h^2) < \infty$, then
\begin{equation}\label{eq:CLTtilde}
\frac{1}{\sqrt{N}}\sum_{i=1}^N (h(\tilde x_{i,t})-\pi(h))\stackrel{\cal L}{\longrightarrow}{\cal N}(0,\sigma_t^2+ {\mathbb V}_{\pi}(h))
\end{equation}
\end{description}
\end{theorem}
Note that amongst the conditions under which this theorem applies, the integrability condition
\begin{equation}
\label{eq:condIntegCLT}
\bar \pi\left( h^2(x') \frac{\pi(x)}{ q_d(x,x')}\right) < \infty
\end{equation}
is required for {\em some} $d$ in $\{1, \ldots, D\}$ and not for {\em all} $d$.
Thus, situations where some transition kernels $q_d(\cdot, \cdot)$
do not satisfy (\ref{eq:condIntegCLT}) can still be covered by this theorem provided that (\ref{eq:condIntegCLT}) holds for at least
one particular kernel. An equivalent expression of the asymptotic variance $\sigma^2_t$ is
$$
\sigma^2_t=\mathbb{V}_\nu \left( (h-\pi(h))\frac{\bar \pi}{\nu}\right) \ \mbox{where} \ \nu(dx,dx')=\pi(dx)\left(\sum_{d=1}^D \alpha_d^t Q_d(x,dx')\right).
$$
Written as above, $\sigma^2_t$ turns out to have the same expression as the asymptotic variance that appears in the CLT
associated to the self-normalized IS algorithm (SNIS) (see Section \ref{sub:pmc2} for a
description of the algorithm and the associated CLT)
where the proposal distribution is $\nu$
and the target distribution $\bar \pi$. Still, the SNIS algorithm cannot be implemented here since, by the above
definition of $\nu$, the proposal distribution depends on both $\pi$ and the weights $(\alpha_d^t)$, which are unknown.
\begin{proof}
Without loss of generality, we may assume that $\pi(h)=0$. Let $d_0 \in \{1, \ldots, D\}$ such that
$\bar \pi\left( h^2(x') \frac{\pi(x)}{ q_{d_0}(x,x')}\right) < \infty$. In the proof of Theorem \ref{theo:Lgn},
it has been shown that $\frac{1}{N}\sum_{i=1}^N \omega_{i,t}\stackrel{N\rightarrow\infty}{\longrightarrow_\P}1$
and thus, we only need to prove that
\begin{equation}\label{eq:cltauxi1}
\frac{1}{\sqrt{N}}\sum_{i=1}^N \omega_{i,t} h(x_{i,t})\stackrel{\cal L}{\longrightarrow}{\cal N}(0,\sigma_t^2)
\end{equation}
We will apply Theorem \ref{TCL} with
\begin{align*}
& U_{N,i}= \frac{1}{\sqrt{N}}\omega_{i,t} h(x_{i,t})=\frac{1}{\sqrt{N}}\frac{\pi(x_{i,t})h(x_{i,t})}{\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1}, x_{i,t})},\\
& {\mathcal G}_N=\sigma\left\{(\tilde x_{i,t-1})_{1 \leq i \leq N}, (\alpha^{t,N}_d)_{1 \leq d \leq D})\right\}.
\end{align*}
Conditionally on ${\mathcal G}_N$, the $(x_{i,t})_{1 \leq i\leq N}$ are independent and
$$
x_{i,t}|{\mathcal G}_N\sim \sum_{d=1}^D \alpha^{t,N}_d Q_d(\tilde x_{i,t-1}, \cdot).
$$
Conditions (i) and (ii) of Theorem \ref{TCL} are straightforwardly satisfied. To check condition (iii), first note that $\E\left( \left.U_{N,i} \right| {\mathcal G}_N\right)=\pi(h)=0$. Moreover,
$$
A_N= \sum_{i=1}^N\E\left( \left.U_{N,i}^2 \right| {\mathcal G}_N\right) = \frac{1}{N}\sum_{i=1}^N \int \pi(dx) h^2(x)\frac{\pi(x)}{\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}
$$
By the LLN for $(\tilde x_{i,t})$ stated in Theorem \ref{theo:Lgn}, we have
$$
B_N=\frac{1}{N}\sum_{i=1}^N \int \pi(dx) h^2(x)\frac{\pi(x)}{\sum_{d=1}^D \alpha^{t}_d q_d(\tilde x_{i,t-1},x)}
\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\sigma_t^2
$$
To prove that condition (iii) holds, it is thus sufficient to show that $|B_N-A_N|\stackrel{N\rightarrow\infty}
{\longrightarrow_\P}0$. Since $\alpha^{t,N}_{d_0}\stackrel{N\rightarrow\infty}{\longrightarrow_\P}\alpha^t_{d_0}>0$,
it is sufficient to consider the bound %quantity ${\mathbb I}_{\{\alpha^{t,N}_{d_0}>2^{-1}\alpha^t_{d_0}\}}|B_N-A_N|$,
\begin{align*}
& {\mathbb I}_{\{\alpha^{t,N}_{d_0}>2^{-1}\alpha^t_{d_0}\}}|B_N-A_N| \\
&\quad \leq {\mathbb I}_{\{\alpha^{t,N}_{d_0}>2^{-1}\alpha^t_{d_0}\}}\sup_{1 \leq d \leq D}\left( \frac{\alpha^t_d-\alpha^{t,N}_d}{\alpha^{t}_d}\right) \frac{1}{N}\sum_{j=1}^N\int \pi(dx)\frac{ h^2(x)\pi(x)}{\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1},x)}\\
&\quad \leq \left(\sup_{1 \leq d \leq D}\left( \frac{\alpha^t_d-\alpha^{t,N}_d}{\alpha^{t}_d}\right) \frac{1}{N}\sum_{j=1}^N\int \pi(dx)\frac{ h^2(x) \pi(x)}{2^{-1}\alpha^{t}_{d_0} q_{d_0}(\tilde x_{i,t-1},x)}\right)\stackrel{N\rightarrow\infty}{\longrightarrow_\P}0
\end{align*}
Thus, condition (iii) is satisfied. Now, consider condition (iv). Using the same argument as in condition (iii), we consider
\begin{align*}
&{\mathbb I}_{\{\alpha^{t,N}_{d_0}>2^{-1}\alpha^t_{d_0}\}} \sum_{i=1}^N \E\left( \left. \frac{1}{N}\omega_{i,t}^2 h^2(x_{i,t}){\mathbb I}_ {\left\{ \frac{\pi(x_{i,t})h(x_{i,t})}{\sum_{d=1}^D \alpha^{t,N}_d q_d(\tilde x_{i,t-1}, x_{i,t})}>C\right\}}\right|{\mathcal G}_N \right)\\
&\quad \leq \frac{1}{N} \sum_{i=1}^N \int \pi(dx)h^2(x) \frac{\pi(x)}{2^{-1}\alpha^{t}_{d_0} q_{d_0}(\tilde x_{i,t-1},x)} {\mathbb I}_ {\left\{ \frac{\pi(x)h(x)}{2^{-1}\alpha^{t}_{d_0} q_{d_0}(\tilde x_{i,t-1}, x)}>C\right\}}\\
&\quad \stackrel{N\rightarrow\infty}{\longrightarrow_\P} \bar \pi\left( h^2(x) \frac{\pi(x)}{2^{-1}\alpha^{t}_{d_0} q_{d_0}( x',x)} {\mathbb I}_ {\left\{ \frac{\pi(x)h(x)}{2^{-1}\alpha^{t}_{d_0} q_{d_0}(x', x)}>C\right\}}\right)
\end{align*}
which converges to 0 as $C$ grows to infinity. Thus, Theorem \ref{TCL} applies and the proof of (\ref{eq:CLTpoids}) is completed.
The proof of (\ref{eq:CLTtilde}) is derived as a direct application of Theorem \ref{TCL} as in the SIR result by setting
$U_{N,i}=\frac{1}{\sqrt{N}}h(\tilde x_{i,t})$ and ${\mathcal G_N}=\sigma\left( (x_{i,t})_{1 \leq i\leq N}, (\omega_{i,t})_{1 \leq i \leq N}\right)$.
\end{proof}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Simulation study for toy example
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Illustrations}\label{sec:zoli}
In this section, we briefly show how the iterations of the PMC algorithm quickly implement adaptivity
towards the most efficient mixture of kernels through three examples of moderate difficulty. (The {\sf
R} programs are available on the authors' websites.)
\begin{ex}\begin{em} As a first toy example, consider the case of the target $\pi$ being
the density of a normal mixture
\begin{equation}\label{eq:mixI}
%\pi(x)=\sum_{i=1}^3 \frac{1}{3}\, f_{\mathscr{N}(\mu_i,\sigma^2_i)}(x)
\sum_{i=1}^3 \frac{1}{3}\, \mathscr{N}(\mu_i,\sigma^2_i)
\end{equation}
and of an independent normal mixture proposal with the same means and variances as in \eqref{eq:mixI} but
started with different weights $\alpha_d^{0,N}$. Note that this is a very special case of $D$ kernel PMC scheme
in that the Markov kernels of Section \ref{sub:alGore} are then independent proposals.
In this case, the optimal choice of weights is obviously $\alpha_d^{\star} = 1/3$.
The corresponding {\sf R} functions are
<<>>=
# Rao-Blackwellized D-kernel PMC for a 3-component normal mixture target,
# using an independent normal-mixture proposal with the same means and
# variances as the target but adaptive mixture weights.
# Args: nsimu = sample size per iteration, niter = number of PMC iterations,
#       var1..var3 / m1..m3 = component variances and means.
# Returns a list with p (niter x 3 weight trajectory), x (last sample),
# w (normalized importance weights of the last sample).
pmc3norind2=function(nsimu=10000,niter=25,var1=1/3,var2=2/3,var3=1,m1=-1,m2=1,m3=2)
{
v=c(var1,var2,var3)   # component variances (shared by target and proposal)
m=c(m1,m2,m3)         # component means (shared by target and proposal)
pp=matrix(0,niter,3)  # alpha^{t,N}: one row of mixture weights per iteration
pp[1,]=c(0.05,0.05,0.9)  # deliberately unbalanced starting weights
for (i in 1:niter)
{
if (i!=1)
{
# Rao-Blackwellized update: alpha_d = sum of the normalized importance
# weights of the points drawn from component d
pp[i,1]=sum(w*(sigma==v[1]))
pp[i,2]=sum(w*(sigma==v[2]))
pp[i,3]=sum(w*(sigma==v[3]))
}
# draw component indicators, then simulate from the selected components
K=sample(1:3,nsimu,prob=pp[i,],replace=TRUE)
sigma=v[K]
mu=m[K]
xx=rnorm(nsimu,mu,sqrt(sigma))
# log importance weight: log target density minus log FULL mixture proposal
# (bug fix: the third proposal component used sd=sqrt(1) instead of
# sqrt(v[3]), which was only correct for the default var3=1)
lw=log(1/3*dnorm(xx,m[1],sqrt(v[1]))+1/3*dnorm(xx,m[2],sqrt(v[2]))+
1/3*dnorm(xx,m[3],sqrt(v[3])))-log(pp[i,1]*dnorm(xx,m[1],sqrt(v[1]))+
pp[i,2]*dnorm(xx,m[2],sqrt(v[2]))+pp[i,3]*dnorm(xx,m[3],sqrt(v[3])))
w=exp(lw-max(lw))  # subtract max before exponentiating for stability
w=w/sum(w)         # self-normalize the weights
}
list(p=pp,x=xx,w=w)
}
@
The starting values for the $\alpha_i^t$'s are those indicated in {\sf pmc3norind2}, that is, $(0.05,0.05,0.9)$.
As shown by the execution of {\sf pmc3norind2()}, represented in
\begin{figure}[hbtp]
\begin{center}
<>=
# Run the adaptive PMC scheme and plot the trajectories of the cumulated
# mixture weights alpha_1 and alpha_1+alpha_2.
res=pmc3norind2()
plot(res$p[,1],ylim=c(0,1),type="l",col="sienna3",lwd=2,ylab="p")
# bug fix: 'lw' is not a graphics argument (silently dropped); use 'lwd'
lines(res$p[,1]+res$p[,2],col="sienna3",lwd=2)
# dotted reference lines at the optimal values 1/3 and 2/3
lines(rep(.333,length(res$p[,1])),col="steelblue2",lty=2)
lines(rep(.666,length(res$p[,1])),col="steelblue2",lty=2)
@
\end{center}
\caption{\label{fig:toystory0}
Convergence of the cumulated weights $\alpha_1^{t,N}$ and $\alpha_1^{t,N}+\alpha_2^{t,N}$
for the three component normal mixture to the optimal values $1/3$ and $2/3$ (represented by
dotted lines). At each iteration, $N=10,000$ points were simulated from the $D$-kernel proposal.}
\end{figure}\end{em}\end{ex}
\begin{ex}\begin{em} As a second toy example, consider the case of a $\mathscr{N}(0,1)$ target and of
the following mixture of $D$ kernels
\begin{equation}\label{eq:conono}
\alpha^{t,N}_1 {\mathscr{T}_2(\tilde x_{i,t-1},1)} + %(x_{i,t}) +
\alpha^{t,N}_2 {\mathscr{N}(\tilde x_{i,t-1},\sigma_2^2)} + %(x_{i,t}) +
\alpha^{t,N}_3 {\mathscr{N}(\tilde x_{i,t-1},\sigma_3^2)}\,, %(x_{i,t}) \,,
\end{equation}
where $\sigma_2^2=4$ and $\sigma_3^2=1/4$. (The first proposal in the mixture is thus a Student $\mathscr{T}_2$
distribution centered at the current value $\tilde x_{i,t-1}$.) The corresponding {\sf R} code is
<<>>=
pmc3nor=function(nsimu=5000,niter=25,var1=18,var2=4,var3=.25,pp1=0.33,pp2=0.33,pp3=0.33)
# D-kernel PMC for a N(0,1) target with a 3-component random-walk mixture
# proposal: a Student t_2 kernel plus two normal kernels with variances
# var2 and var3, all centered at the resampled previous particles.
#   nsimu   : number of particles
#   niter   : number of PMC iterations
#   var1-3  : kernel labels/variances (var1 only labels the t_2 kernel)
#   pp1-3   : starting mixture weights (renormalized internally)
# Returns a list with p (niter x 3 weight matrix), x (last sample) and
# w (last normalized importance weights).
{
v=c(var1,var2,var3)
pp=matrix(0,niter,3)
pp[1,]=c(pp1,pp2,pp3)/(pp1+pp2+pp3)
# initialization: importance sample from a t_10 distribution
xx=rt(nsimu,df=10)
lw=dnorm(xx,log=TRUE)-dt(xx,df=10,log=TRUE)
w=exp(lw-max(lw))
w=w/sum(w)
J=sample(1:nsimu,nsimu,prob=w,rep=TRUE)
xxtilde=xx[J]
for (i in 1:niter){
if (i!=1){
# weight of kernel d = fraction of resampled particles produced by d
pp[i,1]=sum(sigmatilde==var1)/nsimu
pp[i,2]=sum(sigmatilde==var2)/nsimu
pp[i,3]=sum(sigmatilde==var3)/nsimu
}
K=sample(1:3,nsimu,prob=pp[i,],rep=TRUE)
sigma=v[K]
# bug fix: the count was previously stored in a variable named T,
# shadowing R's TRUE shorthand; later log=T / rep=T calls then received
# a numeric count (wrong, i.e. FALSE, whenever the count was 0)
nrw=sum(K>1)
xx[K==1]=rt(nsimu-nrw,df=2)+xxtilde[K==1]
xx[K>1]=rnorm(nrw,mean=xxtilde[K>1],sd=sqrt(sigma[K>1]))
# log importance weight = log target - log mixture proposal density
lw=dnorm(xx,log=TRUE)-log(pp[i,1]*dt((xx-xxtilde),df=2)+
pp[i,2]*dnorm(xx,mean=xxtilde,sd=sqrt(var2))+
pp[i,3]*dnorm(xx,mean=xxtilde,sd=sqrt(var3)))
w=exp(lw-max(lw))
w=w/sum(w)
# multinomial resampling step; sigmatilde records which kernel
# generated each surviving particle
J=sample(1:nsimu,nsimu,prob=w,rep=TRUE)
xxtilde=xx[J]
sigmatilde=sigma[J]
}
list(p=pp,x=xx,w=w)
}
plotur=function(T=25000){
# Draw an image/contour map of a Monte Carlo approximation (based on T
# simulations) of the Kullback--Leibler divergence between the N(0,1)
# target and the 3-component mixture proposal, over the weight simplex
# (alpha1,alpha2), alpha3 = 1-alpha1-alpha2.
xs=rnorm(T)
# displacement x - xtilde under the stationary regime: N(0,2)
shift=rnorm(T,sd=sqrt(2))#rnorm(T)-xs
# kernel densities evaluated at the displacements
dens1=dt(shift,df=2.0)
dens2=dnorm(shift,sd=2)
dens3=dnorm(shift,sd=.5)
# express the mixture as dens3 + a1*(dens1-dens3) + a2*(dens2-dens3)
gap1=dens1-dens3
gap2=dens2-dens3
D=50
agrid=seq(.001,.999,length=D)
bgrid=seq(.001,.999,length=D)
kl=matrix(0,ncol=D,nrow=D)
# entropy term of the target, common to every grid point
ent=mean(log(dnorm(xs)))
for (i in 1:D){
base=agrid[i]*gap1 + dens3
# only visit the admissible part of the simplex (a1+a2<1)
for (j in (1:D)[bgrid<1-agrid[i]])
kl[i,j]=mean(log( base+bgrid[j]*gap2 ))
}
# convert stored values to divergences; untouched (zero) cells get the
# maximum so that the inadmissible corner does not distort the color map
kl[kl!=0]=ent-kl[kl!=0]
topval=max(kl)
kl[kl==0]=topval
image(agrid,bgrid,kl,xlab=expression(alpha[1]),ylab=expression(alpha[2]))
contour(agrid,bgrid,kl,nlevels=300,add=T)
}
@
Figure \ref{fig:toystory} details the
convergence of the weights to the optimal values for several starting values. (Note that the optimal
values can be approximated numerically by a discretization
of the simplex in $\mathbb{R}^3$. For the discretization step adopted in Figure \ref{fig:toystory}, the
optimum corresponds to $\alpha_1^{\star} = 0.41$ and $\alpha_2^{\star} = 0.51$.) While the sequences of
weights $(\alpha_1^{t,N},\alpha_2^{t,N})$ do not always converge exactly to the same value, this is due
to the considerable flatness of the Kullback--Leibler divergence in this region.
\begin{figure}[hbtp]
\begin{center}
<>=
plotur()
res=pmc3nor(niter=250,pp1=.2,pp2=.25,pp3=.55)
lines(res$p[,1:2],pch=19,cex=.2,col="blue4")
res=pmc3nor(niter=250,pp1=.6,pp2=.05,pp3=.35)
lines(res$p[,1:2],pch=19,cex=.2,col="yellow")
res=pmc3nor(niter=250,pp1=.05,pp2=.05,pp3=.9)
lines(res$p[,1:2],pch=19,cex=.2,col="white")
res=pmc3nor(niter=250,pp1=.49,pp2=.49,pp3=.02)
lines(res$p[,1:2],pch=19,cex=.2,col="green2")
@
\end{center}
\caption{\label{fig:toystory}
Numerical approximation of the Kullback--Leibler divergence for the three component
mixture proposal \eqref{eq:conono} in the simplex of $\mathbb{R}^3$. (The discretization
step is $1/75$ in both $\alpha_1$ and $\alpha_2$ directions.) Superposition of the path
of four calls to the $D$-kernel PMC algorithm when started from different values
$(\alpha_1^{1,N},\alpha_2^{1,N})$. The number $T$ of iterations is between $150$ and $500$ depending
on the starting values, while the sample size is $50,000$.
}
\end{figure}
\end{em}\end{ex}
\begin{table}[hbtp]\begin{center}
\begin{tabular}{c|cc|c}
&0 &1 &Total\\
\hline
0 &60 &364 &424\\
1 &36 &240 &276\\
\hline
Total &96 &604 &700\\
\end{tabular}\end{center}
\caption{\label{tab:loid}
Two-by-two contingency table.
}
\end{table}
\begin{ex}\begin{em} Our third example is a contingency table inspired from \cite{Agresti:2002}, given in
Table \ref{tab:loid}. We model this dataset by a Poisson regression,
$$
x_{ij}\sim\mathscr{P}\left(\exp\left(\alpha_i+\beta_j\right)\right)\qquad (i,j=0,1) \,,
$$
with $\alpha_0=0$ for identifiability reasons. We use a flat prior on the parameter $\theta=(\alpha_1,
\beta_0,\beta_1)$ and run the PMC $D$-kernel algorithm with a mixture of $10$ normal random walk proposals,
$\mathscr{N}(\tilde \theta_{i,t-1},\varrho_d I(\hat\theta))$ $(d=1,\ldots,10)$, where $I(\hat\theta)$ is the Fisher information
matrix evaluated at the MLE, $\hat\theta=(-0.43,4.06,5.9)$ and where the scales $\varrho_d$ vary from $1.35e-19$ to
$1.54e+07$ (the $\varrho_d$'s are equidistributed on a logarithmic scale).
%------------------------R code begins-----------------------------------------------------
<<>>=
like=function(par,ob){
# Log-likelihood of the Poisson regression for the 2x2 table ob,
# with par=(alpha1,beta0,beta1) and alpha0 fixed at 0 for identifiability:
# cell (i,j) ~ Poisson(exp(alpha_i + beta_j)).
eta11=par[2]
eta12=par[3]
eta21=par[1]+par[2]
eta22=par[1]+par[3]
ob[1,1]*eta11-exp(eta11)+ob[1,2]*eta12-exp(eta12)+ob[2,1]*eta21-exp(eta21)+ob[2,2]*eta22-exp(eta22)
}
pmc4=function(nsimu=50000,niter=5,nvar=10,ranJ=c(-10,10),pp0=(rep(1,nvar)/nvar),grafison=T)
# Rao-Blackwellized D-kernel PMC sampler for the Poisson-regression
# posterior of the contingency-table example, using nvar normal
# random-walk kernels whose scales are equispaced on a log scale.
# nsimu is the number of IS particles
# niter is the number of PMC iterations
# nvar is the number of kernels
# ranJ is the range of the variances
# pp0 is the vector of weights
# grafison stands for graph is `on'
# Returns a list: p (weight matrix), sig (kernel variances), is (last
# importance weights), llike (log-likelihoods of last resample), par
# (last proposed parameters). Side effect: writes iter*.eps files when
# grafison is TRUE. Requires packages mgcv and MASS and the sibling
# function like() for the log-likelihood.
{
# Rash model
obs=matrix(c(60,364,36,240),ncol=2,byrow=T)
v=sum(obs)*exp(seq(ranJ[1],ranJ[2],length=nvar)) #choice of variances
# pp holds the kernel weights for each iteration (row 1 = pp0)
pp=matrix(pp0,nrow=niter+1,ncol=nvar,byrow=T)
#Fish info
library(mgcv)
library(MASS)
mle=c(-0.43,4.06,5.9)
Ii=matrix(c(403,37.8,238,37.8,96,0,238,0,604),ncol=3)
# SqI: (generalized) inverse square root of the Fisher information,
# used to shape the random-walk increments
SqI=ginv(mroot(Ii))
# Sample of parameters
pareto=matrix(rnorm(3*nsimu,sd=.001),ncol=3)
# initial particles: MLE + scaled correlated normal noise
parms=matrix(mle,ncol=3,nrow=nsimu,byrow=T)+1000*t(SqI%*%t(pareto))
lw=apply(parms,1,like,ob=obs)-apply(dnorm(pareto,log=T,sd=.001),1,sum)
# Importance weights
w=exp(lw-max(lw))
w=w/sum(w)
# multinomial resampling to get the first equally-weighted sample
partilde=parms[sample(1:nsimu,nsimu,prob=w,rep=T),]
if (grafison){
postscript(file="iter0.eps")
nems=c(expression(alpha[1]),expression(beta[0]),expression(beta[1]))
par(mfrow=c(2,2),mar=c(4,2,4,2))
for (j in 1:3)
hist(partilde[,j],nclass=135,col="gold",main=nems[j],xlab="")
hist(apply(partilde,1,like,ob=obs),col="wheat",main="likelihood",xlab="")
par(new=T)
plot(sort(w),type="l",col="sienna4",axes=F,xlab="",ylab="",lwd=2)
dev.off()
}
if (niter>0){
for (i in 1:niter){
# choose a kernel for each particle according to the current weights
K=sample(1:nvar,nsimu,prob=pp[i,],rep=T)
sigma=v[K]
# normals
dife=matrix(rnorm(3*nsimu),ncol=3)
# random-walk move shaped by the Fisher-information square root
parms=partilde+sqrt(sigma)*t(SqI%*%t(dife))
# reuse dife to store squared scaled increments for the mixture density
dife=sigma*dife^2
# Rao Blackwellisation
# rbl is the full mixture proposal density (up to a common constant),
# summed over all nvar kernels rather than only the sampled one
rbl=pp[i,1]*exp(-0.5*apply(dife,1,sum)/v[1])/v[1]^(1.5)
for (t in 2:nvar)
rbl=rbl+pp[i,t]*exp(-0.5*apply(dife,1,sum)/v[t])/v[t]^(1.5)
lw=apply(parms,1,like,ob=obs)-log(rbl)
w=exp(lw-max(lw))
w=w/sum(w)
partilde=parms[sample(1:nsimu,nsimu,prob=w,rep=T),]
#Graphics
if (grafison){
file=paste("iter",i,".eps",sep="")
postscript(file=file)
par(mfrow=c(2,2),mar=c(4,2,4,2))
for (j in 1:3)
hist(partilde[,j],nclass=135,col="gold",main=nems[j],xlab="")
hist(apply(partilde,1,like,ob=obs),col="wheat",main="likelihood & weights",xlab="")
par(new=T)
plot(sort(w),type="l",col="sienna4",axes=F,xlab="",ylab="",lwd=2)
dev.off()
}
# More R&B !
# updated weight of kernel j = total importance weight of its particles
for (j in 1:nvar)
pp[(i+1),j]=sum(w[K==j])
pp[(i+1),] = (pp[(i+1),]) #+.0001)/1.001
pp[(i+1),]
}
}
list(p=pp,sig=v,is=w,llike=apply(partilde,1,like,ob=obs),par=parms)
}
exploit=function(res){
# Graphical representation of the pmc4 output
# Produces a 3x3 panel: histograms of the three parameters (row 1),
# resampled points over profile log-likelihood slices (row 2), and the
# same scatterplots colored by likelihood level (row 3).
#   res : list returned by pmc4 (uses res$is, res$par, res$llike)
# Requires the sibling function like() and the obs table defined below.
# postscript(file="sumacon.eps")
par(mfrow=c(3,3),mar=c(4,2,4,2))
#subsample resampled
mle=c(-0.43,4.06,5.9)
selecto=sample(1:length(res$is),5000,prob=res$is,rep=T)
pur=res$par[selecto,]
# Rash model
obs=matrix(c(60,364,36,240),ncol=2,byrow=T)
hist(pur[,1],nclass=250,col="gold3",xlab=expression(alpha[1]),main="",ylab="",proba=T)
hist(pur[,2],nclass=250,col="gold3",xlab=expression(beta[0]),main="",ylab="",proba=T)
hist(pur[,3],nclass=250,col="gold3",xlab=expression(beta[1]),main="",ylab="",proba=T)
#likelihood sliz
# grids over the ranges of the resampled parameters for the slices
alpha1=seq(min(pur[,1]),max(pur[,1]),length=75)
beta0=seq(min(pur[,2]),max(pur[,2]),length=75)
beta1=seq(min(pur[,3]),max(pur[,3]),length=75)
sliz1=alpha1%*%t(beta0)
# slice 1: vary (alpha1,beta0), fix beta1 at its MLE
par1=matrix(0,ncol=3,nrow=75)
par1[,1]=alpha1
par1[,3]=mle[3]
for (i in 1:75){
par1[,2]=beta0[i]
sliz1[,i]=apply(par1,1,like,ob=obs)
}
image(alpha1,beta0,sliz1,xlab=expression(alpha[1]),ylab=expression(beta[0]),col = heat.colors(123))
points(pur[,1],pur[,2],cex=.3,pch=19)
# slice 2: vary (alpha1,beta1), fix beta0 at its MLE
sliz2=sliz1
par2=par1
par2[,1]=alpha1
par2[,2]=mle[2]
for (i in 1:75){
par2[,3]=beta1[i]
sliz2[,i]=apply(par2,1,like,ob=obs)
}
image(alpha1,beta1,sliz2,xlab=expression(alpha[1]),ylab=expression(beta[1]),col = heat.colors(123))
points(pur[,1],pur[,3],cex=.3,pch=19)
# slice 3: vary (beta0,beta1), fix alpha1 at its MLE
sliz3=sliz1
par3=par1
par3[,1]=mle[1]
par3[,2]=beta0
for (i in 1:75){
par3[,3]=beta1[i]
sliz3[,i]=apply(par3,1,like,ob=obs)
}
image(beta0,beta1,sliz3,xlab=expression(beta[0]),ylab=expression(beta[1]),col = heat.colors(123))
points(pur[,2],pur[,3],cex=.3,pch=19)
#likelihood dots
# map each point's log-likelihood to a heat color (darker = higher)
cala=heat.colors(123)[round(123*(max(res$llike[selecto])-res$llike[selecto])/diff(range(res$llike[selecto])))]
plot(pur[1,1],pur[1,2],cex=.3,pch=19,col=cala[1],xlim=range(alpha1),ylim=range(beta0),
xlab=expression(alpha[1]),ylab=expression(beta[0]))
for (i in 2:5000)
points(pur[i,1],pur[i,2],cex=.3,pch=19,col=cala[i])
# bug fix: first point of the (alpha1,beta1) panel previously used the
# linear index pur[132] instead of pur[1,3]
plot(pur[1,1],pur[1,3],cex=.3,pch=19,col=cala[1],xlim=range(alpha1),ylim=range(beta1),
xlab=expression(alpha[1]),ylab=expression(beta[1]))
for (i in 2:5000)
points(pur[i,1],pur[i,3],cex=.3,pch=19,col=cala[i])
plot(pur[1,2],pur[1,3],cex=.3,pch=19,col=cala[1],xlim=range(beta0),ylim=range(beta1),
xlab=expression(beta[0]),ylab=expression(beta[1]))
for (i in 2:5000)
points(pur[i,2],pur[i,3],cex=.3,pch=19,col=cala[i])
#dev.off()
}
@
%------------------------R code ends-----------------------------------------------------
<>=
res=pmc4()
@
The result of $5$ (successive) iterations of the Rao-Blackwell
$D$-kernel algorithm is as follows: unsurprisingly, the largest variance kernels are hardly ever sampled but fulfill
their main role of variance stabilizers in the importance sampling weights while the mixture concentrates on the medium variances,
with a quick convergence of the mixture weights to the limiting weights. This convergence is illustrated in Figure \ref{fig:conpe}
for the cumulated weights of the $5$th, $6$th, $7$th and $8$th components of the mixture, which converge to $0$, $0.003$, $0.259$
and $0.738$, respectively. The agreement of the simulated sample with the target distribution is shown in Figure \ref{fig:concon},
since the points of the sample do coincide with the (unique) modal region of the posterior distribution. The last row of Figure \ref{fig:concon}
(see also the log-posterior histograms in Figure \ref{fig:un'mo'})
shows in addition that there is no degeneracy in the produced samples: most points in the last sample have very similar posterior
values. For instance, $20$\% of the sample corresponds to $95$\% of the weights, while $1$\% of the sample corresponds to $31$\%
of the weights. A closer look at convergence is provided by Figure \ref{fig:un'mo'} where the histograms of the resampled samples
are represented, along with the distribution of the loglikelihood and the empirical cdf of the importance weights: they do not signal
any degeneracy phenomenon but, on the contrary, a clear stabilization around the values of interest.
\begin{figure}[hbtp]
\begin{center}
<>=
exploit(res)
@
\end{center}
\caption{\label{fig:concon}
Repartition of $5,000$ resampled points after $5$ iterations of the Rao-Blackwellized $D$-kernel PMC sampler
for the contingency table example: {\em first row:} histograms
of the components $\alpha_1$, $\beta_0$ and $\beta_1$, {\em second row:} scatterplot of
the points $(\alpha_1,\beta_0)$, $(\alpha_1,\beta_1)$, and $(\beta_0,\beta_1)$,
on the profile slices of the loglikelihood, {\em third row:} scatterplot of
the same points with a color representation of the corresponding likelihoods: the darker hues
are for higher likelihoods and the range is set by the original distribution of the likelihood
(represented in Figure \ref{fig:un'mo'} for the first four iterations).}
\end{figure}
\begin{figure}[hbtp]
\begin{center}
<>=
plot(res$p[,2],type="l",lwd=2,xlab="t",ylab="",ylim=c(0,1),col="sienna2")
for (i in 3:6)
lines(apply(res$p[,2:i],1,sum),lwd=2,col="sienna2")
@
\end{center}
\caption{\label{fig:conpe}
Convergence of the cumulated weights of the $3$rd,
$4$th, $5$th and $6$th components of the mixture for the
contingency table example.}
\end{figure}
\begin{figure}[hbtp]
\centerline{\includegraphics[height=7cm,angle=270]{iter1.eps}
\includegraphics[height=7cm,angle=270]{iter2.eps}}
\centerline{\includegraphics[height=7cm,angle=270]{iter3.eps}
\includegraphics[height=7cm,angle=270]{iter4.eps}
}
\caption{\label{fig:un'mo'}
Evolution of the samples over $4$ iterations of the Rao-Blackwellized $D$-kernel PMC sampler
for the contingency table example {\em (the output from each iteration is a vignette of four
graphs, to be read from left to right and from top to bottom)}:
histograms of the resampled samples of $\alpha_1$, $\beta_0$ and $\beta_1$
of size $50,000$ and {\em (lower right of each vignette)} loglikelihood and the empirical cdf of the importance weights.
}
\end{figure}
\end{em}\end{ex}
\section*{Acknowledgments}
The authors are grateful to Olivier Capp\'e, Paul Fearnhead and Eric Moulines for helpful comments and
discussions. This work was partially supported by an ACI {\em ``Nouvelles Interfaces des Math\'ematiques"}
grant from the Minist\`ere de la Recherche.
\nocite{*}
\bibliography{dgmr04}%,addX}
\appendix
\section{Convergence theorem for triangular arrays of random variables} \label{sec:theolimtriang}
In this section, we recall some convergence results for triangular
arrays of random variables (see \citealp{Cappe:Moulines:Ryden:2005} or \citealp{Douc:Moulines:2005} for more
details, including the proofs). We will use these results to study the
asymptotic behavior of the PMC algorithm. In the following, let $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ be a
triangular array of random variables defined on the same measurable
space $(\Omega,\mathcal{A})$, let $\{\mathcal{G}_N\}_{N\geq 1}$ be
a sequence of $\sigma$-algebras
included in $\mathcal{A}$. The symbol $X_N \longrightarrow_\P a$ means that
$X_N$ converges in probability to $a$ as $N$ goes to infinity.
The definitions and theorems we need in the above proofs are given below.
\begin{definition}
We say that $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$ if,
$\forall N\geq 1$, the random variables $U_{N,1},\ldots,U_{N,N}$ are
independent given $\mathcal{G}_N$.
\end{definition}
\begin{definition}
A sequence of random variables $\{Z_N\}_{N\geq 1}$ is said to be
bounded in probability if
$$
\lim_{C\rightarrow\infty}\sup_{N\geq 1}\P\left[|Z_N|\geq C\right]=0.
$$
\end{definition}
%\begin{theorem}
%If
%\begin{itemize}
%\item[(i)] $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$;
%\item[(ii)] $\ds
%\forall N\geq 1,\forall i\in\{1,\ldots,N\},\quad \E[|U_{N,i}||\mathcal{G}_N]<\infty\quad\mbox{and}\quad \E[U_{N,i}|\mathcal{G}_N]=0\quad;$
%\item[(iii)] $\exists\epsilon>0$ such that
%$$
%\sum_{i=1}^N\E[U_{N,i}^2\mathbb{I}_{|U_{N,i}|\leq\epsilon}|\mathcal{G}_N]\longrightarrow_\P 0\quad ;\qquad
%\sum_{i=1}^N\E[|U_{N,i}|\mathbb{I}_{|U_{N,i}|>\epsilon}|\mathcal{G}_N]\longrightarrow_\P 0\quad ;
%$$
%\end{itemize}
%then $\ds
%\sum_{i=1}^NU_{N,i}\longrightarrow_\P 0.
%$
%\label{LGN1}
%\end{theorem}
%\begin{theorem}
%If
%\begin{itemize}
%\item[(i)] $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$;
%\item[(ii)] $\ds
%\forall N\geq 1,\forall i\in\{1,\ldots,N\},\quad \E[|U_{N,i}||\mathcal{G}_N]<\infty\quad\mbox{and}\quad \E[U_{N,i}|\mathcal{G}_N]=0\quad;
%\label{LGN2(ii)}$
%\item[(iii)]$\ds
%\mbox{the sequence} \quad
%\left\{\sum_{i=1}^N\E[|U_{N,i}||\mathcal{G}_N]\right\}_{N\geq 1} \quad
%\mbox{is bounded in probability}\quad ;$
%\item[(iv)] $\forall\eta>0$, $\ds
%\sum_{i=1}^N\E[|U_{N,i}|\mathbb{I}_{|U_{N,i}|>\eta}|\mathcal{G}_N]\longrightarrow_\P 0\quad;
%$
%\end{itemize}
%then
%$\ds
%\sum_{i=1}^NU_{N,i}\longrightarrow_\P 0.
%$
%\label{LGN2}
%\end{theorem}
\begin{theorem}
If
\begin{itemize}
\item[(i)] $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$;
\item[(ii)] $\ds
\mbox{the sequence} \quad
\left\{\sum_{i=1}^N\E[|U_{N,i}||\mathcal{G}_N]\right\}_{N\geq 1} \quad
\mbox{is bounded in probability}\quad;$
\item[(iii)] $\forall \eta>0$, $\ds
\sum_{i=1}^N\E[|U_{N,i}|\mathbb{I}_{|U_{N,i}|>\eta}|\mathcal{G}_N]\longrightarrow_\P 0\quad ;
$
\end{itemize}
then $\ds
\sum_{i=1}^N\left(U_{N,i}-\E[U_{N,i}|\mathcal{G}_N]\right)\longrightarrow_\P 0.
$
\label{LGN}
\end{theorem}
%\begin{theorem}
%If
%\begin{itemize}
%\item[(i)] $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$;
%\item[(ii)] $\ds
%\forall N\geq 1,\forall i\in\{1,\ldots,N\},\quad \E[|U_{N,i}||\mathcal{G}_N]<\infty\quad\mbox{and}\quad \E[U_{N,i}|\mathcal{G}_N]=0\quad;
%$
%\item[(iii)] $\exists\sigma^2>0$ such that $\ds
%\sum_{i=1}^N\E[U_{N,i}^2|\mathcal{G}_N]\longrightarrow_\P\sigma^2\quad;
%$
%\item[(iv)] $\ds
%\forall\eta>0,\quad \sum_{i=1}^N\E[U_{N,i}^2\mathbb{I}_{|U_{N,i}|>\eta}|\mathcal{G}_N]\longrightarrow_\P 0\quad;
%$
%\end{itemize}
%then,
%$\ds
%\forall u\in\mathbb{R},\quad \E\left[\left.\exp\left(iu\sum_{i=1}^NU_{N,i}\right)\right|\mathcal{G}_N\right]\longrightarrow_\P
%\exp\left(-0.5 u^2\sigma^2\right).
%$
%\label{TCL1}
%\end{theorem}
\begin{theorem}
If
\begin{itemize}
\item[(i)] $\{U_{N,i}\}_{N\geq 1,1\leq i\leq N}$ is independent given $\{\mathcal{G}_N\}_{N\geq 1}$;
\item[(ii)] $\ds
\forall N\geq 1,\forall i\in\{1,\ldots,N\},\quad \E[|U_{N,i}||\mathcal{G}_N]<\infty\quad;
$
\item[(iii)] $\exists\sigma^2>0$ such that $\ds
\sum_{i=1}^N\left(\E[U_{N,i}^2|\mathcal{G}_N]-\left(\E[U_{N,i}|\mathcal{G}_N]\right)^2\right)\longrightarrow_\P\sigma^2\quad;
$
\item[(iv)] $\ds
\forall\eta>0,\quad \sum_{i=1}^N\E[U_{N,i}^2\mathbb{I}_{|U_{N,i}|>\eta}|\mathcal{G}_N]\longrightarrow_\P 0\quad;
$
\end{itemize}
then,
$$
\forall u\in\mathbb{R},\quad \E\left[\left.\exp\left(iu\sum_{i=1}^N\left(U_{N,i}-\E[U_{N,i}|\mathcal{G}_N]\right)\right)\right|\mathcal{G}_N\right]
\longrightarrow_\P\exp\left(-\frac{u^2\sigma^2}{2}\right).
$$
\label{TCL}
\end{theorem}
\end{document}