% Arrow labels for the birth/death diagram.
% \BirthArrow: a display-style north-east arrow with a script-size \beta_n
%              stacked on top of it (via \buildrel ... \over ...).
% \DeathArrow: a south-west arrow with the letter n stacked on top of it.
% Both bodies start and end with a space token, exactly as before; inside
% math mode (where the visible uses occur) those spaces are ignored.
\def\BirthArrow{ \hbox{$
    {\buildrel {\displaystyle\nearrow}
        \over  {\scriptstyle{\beta_n}}}
  $} }
\def\DeathArrow{ \hbox{${\buildrel n \over \swarrow}$} }

% Bullets-on-a-line pictograms, 80pt variant.
% Each macro sets an 80pt horizontal rule raised by 2pt, backs up 78pt over
% it, and then prints two, three or four bullets at hand-tuned offsets.
% Every line break inside a definition stands for the single inter-item
% space of the one-line form, so the inter-word glue between items is kept.
\def\LineTwo{\hbox{%
    {\raise 2pt \hbox to 80pt{\hrulefill}}
    \hskip -78pt $\bullet$
    \hskip  56pt $\bullet$
    \hskip   2pt}}
\def\LineThree{\hbox{%
    {\raise 2pt \hbox to 80pt{\hrulefill}}
    \hskip -78pt $\bullet$
    \hskip  35pt $\bullet$
    \hskip  13pt $\bullet$
    \hskip   2pt}}
\def\LineFour{\hbox{%
    {\raise 2pt \hbox to 80pt{\hrulefill}}
    \hskip -78pt $\bullet$
    \hskip  10pt $\bullet$
    \hskip  17pt $\bullet$
    \hskip  13pt $\bullet$
    \hskip   2pt}}

% Bullets-on-a-line pictograms, 94pt variant.
% Same construction as the 80pt \Line... macros, but on a 94pt rule and with
% an extra bullet 6pt after the first one.  Line breaks inside a definition
% each stand for one inter-item space, as in the original layout.
\def\LINEthree{\hbox{%
    {\raise 2pt \hbox to 94pt{\hrulefill}}
    \hskip -92pt $\bullet$
    \hskip   6pt $\bullet$
    \hskip  56pt $\bullet$
    \hskip   2pt}}
\def\LINEfour{\hbox{%
    {\raise 2pt \hbox to 94pt{\hrulefill}}
    \hskip -92pt $\bullet$
    \hskip   6pt $\bullet$
    \hskip  35pt $\bullet$
    \hskip  13pt $\bullet$
    \hskip   2pt}}
\def\LINEfive{\hbox{%
    {\raise 2pt \hbox to 94pt{\hrulefill}}
    \hskip -92pt $\bullet$
    \hskip   6pt $\bullet$
    \hskip  10pt $\bullet$
    \hskip  17pt $\bullet$
    \hskip  13pt $\bullet$
    \hskip   2pt}}

% 5pt-wide cell glyphs used to draw the curve picture row by row.
% Every glyph is an \hbox holding two rules: a (possibly zero-width) vertical
% stroke at the left, then a (possibly zero-height) horizontal stroke.
%   \l  left edge + bottom edge        \j  left edge only
%   \z  bottom edge only (5pt strut)   \o  corner dot only
%   \n  empty 5pt x 5pt cell
% Lower-case glyphs use 0.2pt strokes; \L \I \Z \O are the same shapes as
% \l \j \z \o drawn with 0.5pt strokes.
% NOTE(review): \l, \j, \o, \L and \O are predefined in plain TeX
% (special-letter macros); these definitions replace them -- confirm the
% rest of the document never needs the originals.
\def\l{\hbox{%
  \vrule width 0.2pt height 5pt   depth 0pt
  \vrule width 4.8pt height 0.2pt depth 0pt}}
\def\j{\hbox{%
  \vrule width 0.2pt height 5pt   depth 0pt
  \vrule width 4.8pt height 0pt   depth 0pt}}
\def\z{\hbox{%
  \vrule width 0pt   height 5pt   depth 0pt
  \vrule width 5pt   height 0.2pt depth 0pt}}
\def\o{\hbox{%
  \vrule width 0.2pt height 0.2pt depth 0pt
  \vrule width 4.8pt height 0pt   depth 0pt}}
\def\n{\hbox{%
  \vrule width 0pt   height 5pt   depth 0pt
  \vrule width 5pt   height 0pt   depth 0pt}}
\def\L{\hbox{%
  \vrule width 0.5pt height 5pt   depth 0pt
  \vrule width 4.5pt height 0.5pt depth 0pt}}
\def\I{\hbox{%
  \vrule width 0.5pt height 5pt   depth 0pt
  \vrule width 4.5pt height 0pt   depth 0pt}}
\def\Z{\hbox{%
  \vrule width 0pt   height 5pt   depth 0pt
  \vrule width 5pt   height 0.5pt depth 0pt}}
\def\O{\hbox{%
  \vrule width 0.5pt height 0.5pt depth 0pt
  \vrule width 4.5pt height 0pt   depth 0pt}}
% Cell glyphs, size "a": each box is 105pt wide in total.
% The left 2.4pt of every glyph is a foot 1.4pt high and 1pt deep; in \La
% and \Ia a 0.4pt-wide, 104pt-high stroke stands in the middle of that foot.
%   \La  vertical stroke + 0.4pt bottom edge    \Ia  vertical stroke only
%   \Za  bottom edge only                       \Oa  foot only
\def\La{\hbox{%
  \vrule width 1pt     height 1.4pt depth 1pt
  \vrule width 0.4pt   height 104pt depth 1pt
  \vrule width 1pt     height 1.4pt depth 1pt
  \vrule width 102.6pt height 0.4pt depth 0pt}}
\def\Ia{\hbox{%
  \vrule width 1pt     height 1.4pt depth 1pt
  \vrule width 0.4pt   height 104pt depth 1pt
  \vrule width 1pt     height 1.4pt depth 1pt
  \vrule width 102.6pt height 0pt   depth 0pt}}
\def\Za{\hbox{%
  \vrule width 2.4pt   height 1.4pt depth 1pt
  \vrule width 102.6pt height 0.4pt depth 0pt}}
\def\Oa{\hbox{%
  \vrule width 2.4pt   height 1.4pt depth 1pt
  \vrule width 102.6pt height 0pt   depth 0pt}}

% Cell glyphs, size "b": each box is 35pt wide in total.
% Same construction as the "a" family, with a 34pt-high vertical stroke and
% a 32.6pt horizontal run after the 2.4pt foot.
%   \Lb  vertical stroke + 0.4pt bottom edge    \Ib  vertical stroke only
%   \Zb  bottom edge only                       \Ob  foot only
\def\Lb{\hbox{%
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 0.4pt  height 34pt  depth 1pt
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 32.6pt height 0.4pt depth 0pt}}
\def\Ib{\hbox{%
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 0.4pt  height 34pt  depth 1pt
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 32.6pt height 0pt   depth 0pt}}
\def\Zb{\hbox{%
  \vrule width 2.4pt  height 1.4pt depth 1pt
  \vrule width 32.6pt height 0.4pt depth 0pt}}
\def\Ob{\hbox{%
  \vrule width 2.4pt  height 1.4pt depth 1pt
  \vrule width 32.6pt height 0pt   depth 0pt}}

% Cell glyphs, size "c": each box is 15pt wide in total.
% Same construction as the "a" family, with a 14pt-high vertical stroke and
% a 12.6pt horizontal run after the 2.4pt foot.
%   \Lc  vertical stroke + 0.4pt bottom edge    \Ic  vertical stroke only
%   \Zc  bottom edge only                       \Oc  foot only
\def\Lc{\hbox{%
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 0.4pt  height 14pt  depth 1pt
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 12.6pt height 0.4pt depth 0pt}}
\def\Ic{\hbox{%
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 0.4pt  height 14pt  depth 1pt
  \vrule width 1pt    height 1.4pt depth 1pt
  \vrule width 12.6pt height 0pt   depth 0pt}}
\def\Zc{\hbox{%
  \vrule width 2.4pt  height 1.4pt depth 1pt
  \vrule width 12.6pt height 0.4pt depth 0pt}}
\def\Oc{\hbox{%
  \vrule width 2.4pt  height 1.4pt depth 1pt
  \vrule width 12.6pt height 0pt   depth 0pt}}

% Cell glyphs, size "d": each box is 7pt wide in total.
% Same construction as the "a" family, with a 6pt-high vertical stroke and
% a 4.6pt horizontal run after the 2.4pt foot.
%   \Ld  vertical stroke + 0.4pt bottom edge    \Id  vertical stroke only
%   \Zd  bottom edge only                       \Od  foot only
\def\Ld{\hbox{%
  \vrule width 1pt   height 1.4pt depth 1pt
  \vrule width 0.4pt height 6pt   depth 1pt
  \vrule width 1pt   height 1.4pt depth 1pt
  \vrule width 4.6pt height 0.4pt depth 0pt}}
\def\Id{\hbox{%
  \vrule width 1pt   height 1.4pt depth 1pt
  \vrule width 0.4pt height 6pt   depth 1pt
  \vrule width 1pt   height 1.4pt depth 1pt
  \vrule width 4.6pt height 0pt   depth 0pt}}
\def\Zd{\hbox{%
  \vrule width 2.4pt height 1.4pt depth 1pt
  \vrule width 4.6pt height 0.4pt depth 0pt}}
\def\Od{\hbox{%
  \vrule width 2.4pt height 1.4pt depth 1pt
  \vrule width 4.6pt height 0pt   depth 0pt}}

% Plain 7pt-wide cell glyphs drawn with 0.3pt strokes and no foot.
%   \ld  left edge (7pt high) + bottom edge    \id  left edge only
%   \zd  bottom edge only (box is 0.3pt high)  \od  invisible 7pt-wide spacer
%   \oo  invisible 7pt x 7pt cell (a \phantom of a solid 7pt square rule)
\def\ld{\hbox{%
  \vrule width 0.3pt height 7pt   depth 0pt
  \vrule width 6.7pt height 0.3pt depth 0pt}}
\def\id{\hbox{%
  \vrule width 0.3pt height 7pt   depth 0pt
  \vrule width 6.7pt height 0pt   depth 0pt}}
\def\zd{\hbox{\vrule width 7pt height 0.3pt depth 0pt}}
\def\od{\hbox{\vrule width 7pt height 0pt depth 0pt}}
\def\oo{\hbox{\phantom{\vrule width 7pt height 7pt depth 0pt}}}

% Labels and fillers.
%   \Da ... \Dd  calligraphic D with subscript 0..3, padded by two control
%                spaces on the left and one on the right.
%   \P           blank space of exactly the same size as \Dd.
%   \H           a vertical rule centred between two \hfill glues.
% NOTE(review): \P and \H are predefined in plain TeX (paragraph sign and an
% accent macro); these definitions replace them -- confirm nothing later in
% the document relies on the originals.
\def\Da{$\ \ {\cal D}_{0}\ $}
\def\Db{$\ \ {\cal D}_{1}\ $}
\def\Dc{$\ \ {\cal D}_{2}\ $}
\def\Dd{$\ \ {\cal D}_{3}\ $}
\def\P{\phantom{$\ \ {\cal D}_{3}\ $}}
\def\H{\hfill \vrule \hfill}

% \bfpi: five copies of \pi set almost on top of one another; after each
% copy (and the inter-word space that follows it) the position is backed up
% by 9.2pt.  Presumably this fakes a bold pi by overprinting -- TODO confirm.
% Each line break below stands for the single space of the one-line form.
\def\bfpi{\hbox{%
    $\pi$
    \hskip -9.2pt $\pi$
    \hskip -9.2pt $\pi$
    \hskip -9.2pt $\pi$
    \hskip -9.2pt $\pi$}}

% Display fonts: cmr10 magnified by \magstep5 (\biggest, used for the cover
% title) and by \magstep2 (\bigger, used for headings and the dedication).
\font\biggest = cmr10 scaled \magstep5
\font\bigger  = cmr10 scaled \magstep2

\footline={\tenrm\qquad John Skilling\quad\hfil Maximum Entropy Data Consultants Ltd (est.1981)\hfil\quad February 2004\qquad}

\vskip 6cm
\centerline{\biggest BayeSys and MassInf}

\vskip 3cm
% Cover diagram (display math), read left to right:
%   - a three-line label "n / or / n-1";
%   - a long left arrow and a braced stack: "Left (down-arrow) Right",
%     \LineThree, a "Death" arrow row, \LineTwo;
%   - \DeathArrow, a central \LineThree raised 12pt, \BirthArrow;
%   - raised 24pt: a braced stack "Left (down-arrow) Right", \LineFour,
%     a "Birth" arrow row, \LineThree, then a long right arrow and the
%     three-line label "n+1 / or / n".
% All \raise, \lower and \hskip amounts are hand-tuned offsets.
$$
\hbox{
\lower 12pt\vbox{\hbox{\ $n$}
                 \hbox{\ or}
                 \hbox{$n-$1}}
                }
\longleftarrow
\Bigg\{
\lower 12pt\vbox{ \hbox{\hskip 2pt {$\scriptstyle\rm Left$} \hskip 29.5pt $\scriptstyle\downarrow$ \hskip 6pt {$\scriptstyle\rm Right$}}
                  \LineThree
                  \hbox{\hskip 6pt \hbox to 25pt{\leftarrowfill} Death $\rightarrow$}
                  \LineTwo
                }
\Bigg\}
\hskip10pt \raise 2pt\DeathArrow \hskip10pt
\raise 12pt\LineThree
\hskip10pt \raise 14pt\BirthArrow \hskip10pt
\raise 24pt\hbox{$
                  \Bigg\{
                  \lower 12pt\vbox{ \hbox{\hskip 2pt {$\scriptstyle\rm Left$} \hskip 5pt $\scriptstyle\downarrow$ \hskip 10pt {$\scriptstyle\rm Right$}}
                                    \LineFour
                                    \hbox{\hskip 6pt $\leftarrow\,$Birth$\,\rightarrow$}
                                    \LineThree
                                  }
                  \Bigg\}
                  \longrightarrow
                  \hbox{\lower 12pt \vbox{ \hbox{$n$+1}
                                           \hbox{\ or}
                                           \hbox{\ $n$}}
                                         }
                $}
$$

\vskip 1cm
$$
\hbox
{
\vbox{\lineskip = 0pt \baselineskip = 0pt
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\o\z\z\o\z\z\o\j\Z\O\Z\O\Z\O\Z\O\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\I\L\I\I\I\L\I\I\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\L\O\Z\I\L\O\Z\I\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\Z\I\L\Z\Z\I\L\O\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\I\Z\Z\O\Z\Z\O\I\Z\O\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\L\I\Z\I\L\O\L\I\I\n\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\n\O\Z\O\Z\O\I\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\o\I\I\L\I\I\L\I\Z\I\L\O\L\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\Z\I\L\O\Z\I\Z\O\L\O\Z\I\Z\O\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\L\Z\Z\I\L\O\I\L\Z\I\L\Z\I\I\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\Z\Z\O\I\L\O\Z\Z\Z\O\Z\I\n\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\L\O\L\I\Z\I\L\O\Z\I\L\O\n\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\Z\I\Z\O\I\Z\O\I\I\Z\O\I\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\L\Z\I\I\L\I\L\I\L\I\L\I\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\Z\Z\O\Z\Z\O\Z\I\Z\O\Z\O\Z\O\Z\O\L\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\L\O\L\I\Z\I\L\O\I\L\I\I\I\L\I\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\Z\I\Z\O\I\Z\O\I\L\O\Z\I\L\O\Z\I\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\L\Z\I\I\L\I\L\I\Z\I\L\Z\Z\I\L\O\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\L\O\L\I\I\L\I\I\L\I\Z\I\L\O\L\I\I\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\z\z\j\l\z\j\l\z\j\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n}
     }
\hskip -177.6pt \vbox{\hbox{$\bigcirc$ \hskip -11.5pt ${\scriptstyle R}$} \vskip116.2pt \hbox{ } }
\hskip  -86.4pt \vbox{\hbox{$\bigcirc$ \hskip -11.0pt ${\scriptstyle L}$} \vskip100.0pt \hbox{ } }
\hskip   58.9pt \vbox{\hbox{$\bigcirc$ \hskip -11.7pt ${\scriptstyle X}$} \vskip 80.2pt \hbox{ } }
\hskip  172pt
}  
$$

\vskip 1cm
% Cover diagram (display math): a typewriter-font string of "." and "o"
% characters, with a second row beneath it holding five
% left-arrow | right-arrow markers.  The \hskip offsets and the arrow
% lengths (16pt, 16pt, 10.5pt, 10.5pt, 31.5pt) are hand-tuned to position
% each marker under the string.
$$
\vbox{
      \hbox{\tt ..o..oo....oo.oo.....oo.ooo....ooooooo..ooooo...ooo.o...o...o....o...}
      \hbox{
            \hskip 45.4pt    \hbox to 16.0pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 16.0pt{\rightarrowfill}
            \hskip 30.3pt    \hbox to 16.0pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 16.0pt{\rightarrowfill}
            \hskip 20.0pt    \hbox to 10.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 10.5pt{\rightarrowfill}
            \hskip 15.0pt    \hbox to 10.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 10.5pt{\rightarrowfill}
            \hskip 19.8pt    \hbox to 31.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 31.5pt{\rightarrowfill}}
     }
$$

\vfill\eject
\footline={\tenrm\qquad John Skilling, Kenmare, Ireland\hfil\quad February 2004\qquad}

\centerline{\bigger BayeSys and MassInf}
\bigskip
\bigskip
\bigskip
\bigskip
\bigskip
\bigskip
{\bigger
\noindent To my intellectual ancestors the late Edwin T. Jaynes, and Steve Gull,  to my descendant Sibusiso Sibisi, 
and the many colleagues and friends over the past quarter-century who have inspired and encouraged the development of these ideas.}
\vskip 13cm
\noindent ${\rm BayeSys}^{\rm TM}$ and ${\rm MassInf}^{\rm TM}$ are trademarks of Maximum Entropy Data Consultants Ltd, 114c Milton Road, Cambridge, England.
Copyright of this manual and the accompanying source code modules listed in section 16 is assigned to Maximum Entropy Data Consultants Ltd.
These program modules are distributed in the public domain under the terms of the GNU Lesser General Public License (version 2.1) available from the 
Free Software Foundation Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.

\vfill\eject

\footline={\hss\tenrm\folio\hss}
\pageno=1
\centerline{CONTENTS}
\halign{    # \hfil& # \dotfill$\ldots\ldots\ldots\ldots\ldots\ $          &  # \cr
         PART 1.   &\ 1. Introduction                                      &\ 2 \cr
         OVERVIEW: &\ 2. Overview of Inference                             &\ 4 \cr
                   &\qquad 2.1. Inversion                                  &\ 4 \cr
                   &\qquad 2.2. Regularisation                             &\ 4 \cr
                   &\qquad 2.3. Probabilities                              &\ 5 \cr
                   &\qquad 2.4. Prior probabilities                        &\ 5 \cr
                   &\qquad\qquad 2.4.1. Atomic priors                      &\ 6 \cr
                   &\qquad\qquad 2.4.2. Coordinates                        &\ 7 \cr
                   &\qquad 2.5. Sampling                                   &\ 8 \cr
         PART 2.   &\ 3. Markov chain Monte Carlo (MCMC)                   &\ 9 \cr
         THEORY:   &\qquad 3.1. The number of atoms                        &\ 9 \cr
                   &\qquad 3.2. Coordinates                                & 10 \cr
                   &\qquad 3.3. R\^ole of likelihood                       & 12 \cr
                   &\qquad 3.4. Binary slice sampling                      & 13 \cr
                   &\ 4. Annealing                                         & 15 \cr
                   &\qquad 4.1. Selective annealing                        & 17 \cr
                   &\qquad\qquad 4.1.1. Imperfections                      & 18 \cr
                   &\qquad\qquad 4.1.2. Properties                         & 18 \cr
                   &\qquad 4.2. Comparison with statistical thermodynamics & 19 \cr
                   &\ 5. The BayeSys program                               & 21 \cr
                   &\qquad 5.1. The BayeSys prior                          & 22 \cr
                   &\qquad\qquad 5.1.1. Number of atoms                    & 22 \cr
                   &\qquad\qquad 5.1.2. Coordinates                        & 23 \cr
                   &\qquad 5.2. The BayeSys engines                        & 26 \cr
                   &\qquad\qquad 5.2.1. LifeStory1                         & 27 \cr
                   &\qquad\qquad 5.2.2. LifeStory2                         & 29 \cr
                   &\qquad\qquad 5.2.3. GuidedWalk                         & 30 \cr
                   &\qquad\qquad 5.2.4. Leapfrog1 and Leapfrog2            & 32 \cr
                   &\qquad\qquad 5.2.5. Chameleon1                         & 33 \cr
                   &\qquad\qquad 5.2.6. Chameleon2                         & 34 \cr
                   &\ 6. Massive Inference (MassInf)                       & 35 \cr
                   &\qquad 6.1. MassInf priors                             & 35 \cr
                   &\qquad 6.2. MassInf likelihood                         & 36 \cr
                   &\qquad\qquad 6.2.1.  Gaussian data                     & 37 \cr
                   &\qquad\qquad 6.2.2.  Poisson data                      & 37 \cr
                   &\qquad 6.3. MassInf flux unit                          & 37 \cr
                   &\qquad 6.4. MassInf fluxes                             & 38 \cr
                   &\ 7. Display of results                                & 39 \cr
         PART 3.   &\ 8. BayeSys prior parameters                          & 40 \cr
         PRACTICE: &\ 9. BayeSys algorithm parameters                      & 41 \cr
                   & 10. BayeSys structures                                & 42 \cr
                   & 11. User procedures                                   & 44 \cr
                   & 12. UserMonitor                                       & 45 \cr
                   & 13. MassInf prior parameters                          & 46 \cr
                   & 14. MassInf likelihood settings                       & 47 \cr
                   &\qquad 14.1. Gaussian data                             & 48 \cr
                   &\qquad 14.2. Poisson data                              & 48 \cr
                   &\qquad 14.3. MassInf with BayeSys                      & 48 \cr
                   & 15. Using BayeSys                                     & 49 \cr
                   & 16. Program files                                     & 50 \cr
                   & REFERENCES                                            & 51 \cr
                   & INDEX                                                 & 53 \cr
       }
\vfill\eject

\centerline{\bigger PART 1. OVERVIEW}
\bigskip
\noindent{$\underline{\hbox{\bf{Section 1. Introduction}}}$}
\bigskip

The second half of the $20^{\rm th}$ century saw a revolution in methods of inference. 
On the theoretical side was the rise of ``Bayesian'' probabilistic analysis,
and on the practical side was the development of computer hardware along with the exploration algorithms to use it.
At times in those decades, the theoretical arguments became heated to a degree more often associated with the darker side of organised religion,
with the guardians of doctrinal orthodoxy ranged against the Bayesian rationalists to the bemusement of ordinary scientists who simply wanted to analyse their data.
The dispute ought to have been settled by the paper of Richard Cox (1946), 
which proved that straightforward probabilities are the only allowable method of consistent inference.
But it was not.
Edwin Jaynes (2003) presents the Cox proof in compelling detail in chapter 2 of his book (posthumously edited by Larry Bretthorst),
and also gives the history (chapter 16) and pathology (chapter~17) of orthodox statistics, with the stylistic flair of the wartime correspondent that, in a way, he was.
He who engages with irrationality should expect an unreasoning response, 
and sadly that is what Jaynes and like-minded colleagues all too often received for their efforts.

Yet Nature has a way of educating us.
In inference as in engineering, practical power comes from obeying the laws, and working in sympathy with them.
Null hypotheses, confidence intervals and the like cannot cope with the complexity of modern problems.
Probabilistic analysis demonstrably does.
It is this wide-ranging power, more than logical argument, that is convincing scientists at large that the Bayesian approach should be used.
Sivia (1996) gives a good introduction to Bayesian data analysis from a scientist's perspective.
Though, to be fair to the orthodox school, it wasn't much use knowing what one should do if one didn't know how.
The practical development of Bayesian methods can be conveniently dated to Metropolis {\it et al.}\ (1953), 
who presented a Monte Carlo exploration algorithm having a real prospect of dealing with large problems.
Hastings (1970) extended and generalised this work, and nowadays most serious inference algorithms follow the Metropolis-Hastings approach.
The manufacture of ever-larger computers has allowed application to ever-harder problems, 
which have catalysed a bewildering array of algorithmic developments.
Brooks (1998) gives a clearly written review, and MacKay (2003) gives a professional introduction to Monte Carlo methods from a wider perspective.

The defining feature of large problems is that they have a large number of parameters, viewed geometrically as a dimensionality.
Large dimensionality has several awkward properties: most directions are nearly orthogonal to the one you want, 
the interesting domain you seek is usually exponentially tiny,
and it is likely to have a peculiar shape as well.
It is a lot harder to program in $n$ dimensions than in 1.
Yet $n$ dimensions can be mapped to 1 by using a space-filling curve.
Specifically, the Hilbert curve (1891) uniformly covers the interior of an $n$-dimensional cube, whilst preserving a useful degree of locality.
Using it, $n$ coordinates can be encoded into a single number, albeit one with extended precision so that accuracy is preserved.
Curves like this have been considered something of a curiosity, 
and have attracted only a few quirky applications (Abend, Hartley \& Kanal 1965, Bially 1969, Stevens, Lehar \& Preston 1983, Song \& Roussopoulos 2002).
Indeed, to anyone classically trained in differential calculus and continuum mathematics, space-filling curves do look rather odd.
However, from a computational point of view, using a Hilbert curve merely amounts to shuffling and re-defining the bits representing coordinate values, 
which is not particularly peculiar.

Of course, there is a price to pay.
A function that is smooth and straightforward in two, three or more dimensions will look sharp and jagged when stretched out along a Hilbert curve.
On the other hand, a function that is twisted and torn in several dimensions doesn't look appreciably worse.
So, if we can work in one dimension at all (and we can), we may hope to be able to work with a wide class of functions that have traditionally been regarded as difficult.
Indeed, I suggest that the ability to alter the dimensionality of a problem at will is a powerful and under-appreciated tool, 
which we should be able to use with advantage when exploring spaces of high dimension.
Such an approach implies a change of focus, away from the lines and curvatures of geometry, and towards the connectedness of topology.

The BayeSys program (an acronym for {\bf Baye}sian {\bf Sys}tem, pronounced ``basis'') is built around this basic idea.
Its aim is to give you ``typical'' samples of your entire object of interest, 
which the program locates by using the object's probabilistic fit to your data, known as its likelihood.
It also calculates the numerical ``evidence'' that quantifies how well your modelling of objects managed to predict your data.
BayeSys supplies locations, you supply their likelihoods: it's just about as simple as that.
The program includes a variety of exploration procedures which I call ``engines'', all of which exploit Hilbert coding to reduce the dimensionality.
Using a space-filling curve as a central theme is unconventional and distinctive: 
I offer it as an alternative to currently popular strategies, 
hoping that it will have at least as much power in a wide variety of probabilistic and optimisation applications.
Much else in BayeSys is new too.
There is ``selective annealing'', 
which is a form of simulated annealing that is designed to cope properly with difficult problems which may have multiple maxima.
There is a binary variant of slice sampling used for exploration along the Hilbert curve.
There are linked lists for locating neighbouring objects, which can catalyse each other's evolution.
The ``Massive Inference'' extension (``MassInf'' for short) copes semi-analytically with coordinates representing additive quantities such as intensity or flux, 
which eases the exploration task.
These and other coding tricks are hidden within the program.
You, the user, see only a simple interface.

Section 2 of this manual gives an overview of inference as appropriate to BayeSys and MassInf, and Sections 3 to 7 give the theory underlying the program.
I have tried to present this material straightforwardly.
After all, it involves nothing more than some easy algebra with occasional use of basic calculus.
I have not aimed to present maximal generality, in notation and terminology that are then necessarily opaque: 
that just makes the subject look difficult.
Moreover, generality is often the enemy of efficiency as well as of clarity.
Instead, I present the ideas in simple terms and conversational style, referring to the literature where connections exist.
Generalisation is usually obvious anyway, once an idea is understood.
All this theoretical material is for the reader who wants to know how it all works.  Otherwise, it's optional.
The later Sections 8 to 15 describe the requirements for writing an application, and Section 16 lists the program files that should be provided with BayeSys.
These include ``toy'' example programs, which can serve as templates for your own application's code.
The program language is ANSI C.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 2. Overview of Inference}}}$}
\bigskip

For us, inference is the art of recovering an object $\theta$ from data $D$.
Usually, the object will be something quite complicated, perhaps a spectrum or an image, 
having at least several and maybe thousands or millions of degrees of freedom.  The data, acquired through some experiment $R$ so that
$$
D \approx R(\theta),
$$
will nearly always be incomplete (fewer components than $\theta$ has) or noisy (inexact), or somehow inadequate to fix $\theta$ unambiguously.
Methods of inference fall into three increasingly sophisticated classes, ``inversion'', ``regularisation'' and ``probabilistic''.

I can illustrate these with the population of the four countries (England, Scotland, Wales and Ireland) that comprise the British Isles.
I have received data (mythical, of course) that of the entire population of 10000, 
8000 live in England and Scotland, and 7500 live in England and Wales.  
From these three numbers, I have to estimate the populations $\theta = (\theta_1, \theta_2, \theta_3, \theta_4)$ of the four countries.

\centerline{\vbox{\vskip 4pt
\offinterlineskip
      \halign{  & \vrule# &
                   \strut\quad\hfil#\quad &
                                             \vrule# &
                                               \strut\quad\hfil#\quad &
                                                                         \vrule# &
                                                                       \strut\quad\hfil# \cr
                        \multispan5\hrulefill                                            \cr
                height2pt &    \omit           &     &   \omit             &     & \omit \cr
                          &Ireland = $\theta_4$&     &Scotland = $\theta_2$&     &  2500 \cr
                height2pt &    \omit           &     &   \omit             &     & \omit \cr
                        \multispan5\hrulefill                                            \cr
                height2pt &    \omit           &     &   \omit             &     & \omit \cr
                          &Wales   = $\theta_3$&     &England  = $\theta_1$&     &  7500 \cr
                height2pt &    \omit           &     &   \omit             &     & \omit \cr
                        \multispan5\hrulefill                                            \cr
                   \omit  &       2000         &\omit&     8000            &\omit& 10000 \cr
          }
       } }
In algebraic terms,
$$
  D = \left[ \matrix{ 2500 \cr
                      7500 \cr
                      2000 \cr
                      8000 \cr } \right]
    = \left[ \matrix{ 0 & 1 & 0 & 1 \cr
                      1 & 0 & 1 & 0 \cr
                      0 & 0 & 1 & 1 \cr
                      1 & 1 & 0 & 0 \cr } \right] 
                                   \left[ \matrix{ \theta_1 \cr
                                                   \theta_2 \cr
                                                   \theta_3 \cr
                                                   \theta_4 \cr } \right]
    = R \theta.
$$

\bigskip
\noindent{2.1. INVERSION}
\smallskip

Inversion in its pure form inverts $R$ to obtain the estimate $ \hat\theta = R^{-1} D $, 
which might work except that $R$ is here (and in general) singular so that there is no inverse.  
A common fixup is to use a pseudo-inverse instead, so that $ \hat\theta = X D $ where $X$ approximates the inverse of $R$.  
But the data which we would have obtained if this were true are $R X D$ which, insofar as $X$ is {\bf not} the inverse of $R$, differ from the true data, 
showing $\hat\theta$ cannot be correct.  
So much for inversion.  
This method should only be used when the results are essentially unambiguous, or don't matter much.

\bigskip
\noindent{2.2. REGULARISATION}
\smallskip

In my illustrative problem, simple logic shows that the data leave only one degree of freedom. 
Let this be the Irish population $x$.  
Then
$$
  \theta_4 = x,\quad \theta_3 = 2000-x,
  \quad \theta_2 = 2500-x,\quad \theta_1 = 5500+x,
$$
and this satisfies the data for any $x$.  
Regularisation consists of finding the ``best'' result from among all those that agree with the data, 
as defined by maximising some regularisation function $\Phi(\theta)$ representing quality. 
The general method is usually attributed to Tikhonov \& Arsenin (1977). 
The commonest such method is least squares, which here fixes $x$ by minimising 
$$
  \theta_1^2 + \theta_2^2 + \theta_3^2 + \theta_4^2 = -\Phi(\theta)
$$
subject to agreement with the data.  
Unfortunately the selected value of $x$ is $-250$, indicating that the population of Ireland was negative.
Oops!  Least squares is simple, but not necessarily best.

For the sort of positive distribution being considered, maximum entropy (maximise $ \Phi = -\sum \theta_i \log\theta_i $) is useful.  
Negative populations are prohibited by the logarithm, and the result

\centerline{\vbox{\vskip 4pt
\offinterlineskip
      \halign{ & \vrule#  &
             \strut\quad\hfil#\hfil\quad &
                                       \vrule# &
                                   \strut\quad\hfil#\hfil\quad &
                                                             \vrule#  \cr
                        \noalign{\hrule}                              \cr
                height2pt &    \omit     &     &   \omit       &      \cr
                          &Ireland =  500&     &Scotland = 2000&      \cr
                height2pt &    \omit     &     &   \omit       &      \cr
                        \noalign{\hrule}                              \cr
                height2pt &    \omit     &     &   \omit       &      \cr
                          &Wales   = 1500&     &England  = 6000&      \cr
                height2pt &    \omit     &     &   \omit       &      \cr
                        \noalign{\hrule}                              \cr
          }
       } }
\noindent
has the 1:3 north:south ratio nicely independent of longitude, and the 1:4 west:east ratio nicely independent of latitude.  
In fact, entropy is the only function yielding this general symmetry, as was proved by Shore \& Johnson (1980,1983), also with less formality by Gull \& Skilling (1984).
This may well account for the high quality of many maximum-entropy reconstructions.
If there is a general regularisation method at all, it has to apply in special cases, and maximum entropy is the only sensible method in this particular special case.
But questions remain.  How should we account for noise in the data?
What about nuisance parameters?  How uncertain is the ``best'' result?

\bigskip
\noindent{2.3. PROBABILITIES}
\smallskip

In 1946, R.T. Cox proved that the only way of doing inference consistently is through probability calculus (Cox, 1946) and seeking the posterior probability
distribution $\Pr(\theta\mid D)$ of result $\theta$, given data $D$.
We start by assigning a prior distribution representing what we think $\theta$ might be, and modulate this with how likely the data measurements would then be.
This gives the joint distribution of $\theta$ and $D$, which can be alternatively expanded as the posterior we want, 
scaled by a coefficient called the ``prior predictive'' --- or ``evidence'' for short.
$$
\matrix {
      \Pr(\theta,D) & = &\Pr(\theta) \times \Pr(D\mid\theta)   &    =      &\Pr(\theta\mid D) \times \Pr(D)         &\qquad\qquad\parallel I \cr
       \hbox{Joint} &   &\hbox{Prior} \times \hbox{Likelihood} &           &\hbox{Posterior} \times \hbox{Evidence} &                        \cr
                    &   &\hbox{assumptions \& measurements}    &\Rightarrow&                 \hbox{inference}       &                        \cr
        }
$$
That's the famous, and stunningly elementary, {\bf Bayes' Theorem}.
In writing it, I have explicitly included the dependence on whatever contextual information $I$ is to hand, through the ``$\parallel I$'' qualification.
All probabilities in the equation are to be understood as conditioned on $I$, though in practice one often omits the symbol when the context is unambiguous.

Be careful, though.  
Wonderfully confusing paradoxes can follow if you inadvertently alter the context in the middle of a probability derivation, and don't notice.
It's just the same with other general terms such as ``average''.
In the ordinary arithmetical case, the average of 10 and 40 is $(10+40)/2 = 25$,
but if the situation is logarithmic, the average becomes $\sqrt{10 \times 40} = 20$ instead.
Change the context in the middle of your calculation and you may ``prove'' \hbox{$25=20$}.
However, we would not consider that such a discrepancy implied a contradiction within the theory of averages: it's merely a mistake.
Yet exactly such a claim of contradiction was actually made about probability theory (Dawid, Stone and Zidek 1973).
Jaynes (2003) details the shameful saga.
The danger with a probabilistic calculation is that the level of abstraction is likely to be greater,
so that the mistake can be less obvious and the conclusion may be seductively misleading.
To avoid such error, just take care to track the context.

Observe that the evidence amounts to the likelihood $\Pr(D\mid I)$ for the contextual information.
We would use exactly the same Bayes' Theorem to compare different contexts $I$ (under more general background information~$J$) 
as we do here to compare different parameter values $\theta$: it is only the identifications of the symbols that would change.
If we don't have any alternatives for $I$, the evidence is useless to us, 
though in principle we should still calculate it and publish it in case somebody else can think of an alternative that might explain the data better
(or worse, if their alternative's evidence turns out to be smaller).
Always calculate the evidence (its units are inverse data) as an integral part of Bayesian inference.  
It's a basic part of honest presentation, but few people do it, which is a pity.

\bigskip
\noindent{2.4. PRIOR PROBABILITIES}
\smallskip

Before we even start to use the data, we need to assign a prior probability distribution $\pi(\theta)=\Pr(\theta\mid I)$ to everything relevant that we don't know.
I sometimes remark that I don't know what I am talking about --- until I have defined a prior.  
We have complete freedom to set $\pi$, subject only to it being non-negative and summing to unity (so that it can actually {\bf be} a probability distribution).  
In practice, we will assign $\pi$ by using such desiderata as symmetry, simplicity, reasonable behaviour, perhaps maximum entropy, according to our experience and judgment. 
We can also use the posterior distribution from previous observations as our current prior.
(That's what ``doing inference consistently'' means.  
It doesn't matter whether we use dataset $D_1$ first then $D_2$, or the other way round, or use them both together.  
Each procedure yields the same posterior.)
Yes, the assignment of prior is subjective.  That's the way it is!
But, after acquiring data, we can compare evidence values, so the subjective assumptions can be objectively compared.

In my illustrative population example, we need to assign a prior over the four non-negative populations.  
In fact, we may as well split the British Isles into arbitrary fractions $f_i$ instead of restricting to four quarters.  
One choice (Ferguson 1973), which can be used at any resolution and is quite often suggested (though seldom by me nowadays), is a gamma distribution
$$
  \pi(\theta) \propto \prod \theta_i^{-1 + c f_i} e^{-\alpha \theta_i}
$$
where $\alpha$ and $c$ are constants, or a Dirichlet distribution if $\sum\theta$ is fixed.  
Priors nearly always involve constants like these, grandiosely called ``hyper-parameters'' to distinguish them from ordinary parameters $\theta$. 
Interpreting them and setting them appropriately is part of the art.

Given the success of maximum entropy in regularisation, it is tempting to try
$$
  \pi(\theta) \propto
         \exp( - \theta_1 \log\theta_1 - \theta_2 \log\theta_2
               - \theta_3 \log\theta_3 - \theta_4 \log\theta_4 )
$$
or some close variant.  However, integration shows that the implied prior on the total population $\Theta = \theta_1 + \theta_2 + \theta_3 + \theta_4$ 
is nothing like $\exp( - \Theta \log\Theta)$, whereas the gamma distribution would have remained intact, albeit with quadrupled $c$. 
Worse, the entropy prior cannot be subdivided.
There is no distribution $p(\cdot)$ which could be applied independently to northern and southern England and then integrated to give a prior 
like $\exp( - \theta \log\theta )$ for the population of England as a whole.
Entropy is a good regulariser but, like most functions, it does not translate into a good prior.
(Assigning $\pi$ by maximising entropy $-\int\pi\log\pi\,d\theta$ under linear constraints is different, and legitimate.)

\bigskip
\noindent{2.4.1. \it Atomic priors}
\smallskip

One useful way of breaking down the complexity of a problem is to construct the object as some number $n$ of {\it a-priori-}equivalent ``atoms'', 
which are scattered randomly with positions $x$ over the domain of interest.  
These positions determine whatever extra attributes are needed to model the object. 
Statisticians call the atoms in an atomic prior the ``components of a mixture distribution'' (Titterington, Smith and Makov 1985), but I prefer my own terminology. 
In the population example, an atom might represent a person, in which case those particular data would require 10000 atoms. 
Or an atom might represent a census unit of 500 people, in which case only 20 would be needed.  
Or an atom might represent a tribe whose size was drawn from some distribution such as an exponential: 
this would often be better because of the flexibility involved, and might require even fewer atoms.  
Whatever an atom represents, and however many there are, letting them fall randomly over the domain ensures that we can compute at arbitrary resolution.

Atomic priors give a structure-based description.  
The number of atoms is usually allowed to vary, and the computer only needs to deal with the amount of structure that is actually required.
Spatial resolution, in the form of accuracy of location of the atoms, is ``free'', 
because each atom is automatically held to the arithmetical precision of the hardware.  
Algebraically-continuous priors like the gamma distribution, by contrast, give a cell-based description.
To use them, we have to divide the spatial domain into as many cells as we are ever likely to need, and be prepared to compute them all.
Resolution is directly limited by the computer memory and processor time.
This comparison between atomic and continuous priors is somewhat analogous to the comparison between a Monte Carlo representation by samples 
and a full covering of the entire space. 
In each case, the former is practical, while the latter is likely not.  
There is no loss of generality.
If required, we could let the whole object be a single atom having attributes for every cell, which would reproduce a cell-based prior.

To define an atomic prior, we assign distributions for the number $n$ of atoms, and for their attributes.  
Typical priors for $n$ are uniform
$$
  \pi(n) = \hbox{constant}
$$
between a minimum and maximum number (whose equality would fix $n$ at that value), or Poisson
$$
  \pi(n) = e^{-\alpha} \alpha^n / n!
$$
(binomial if a maximum is imposed), or geometric
$$
  \pi(n) = (1-c)c^n\quad \hbox{with $c < 1$}
$$
(which is wider than the Poisson).
As a technical note, only the Poisson assignment is ``infinitely divisible'', meaning that it can be accumulated from arbitrarily small but 
fully independent subdivisions of the domain --- remember that Poisson distributions combine into another Poisson distribution with summed mean.  
If computed at small scale, the other forms need small correlations to make the total number correctly distributed.  
I don't think that that matters at all.  
Indeed, I usually prefer the less-committal geometric assignment.

\bigskip
\noindent{2.4.2. \it Coordinates}
\smallskip

As for the attribute coordinates, their priors will depend on what the attributes are.
Importantly, there are many applications where an atom has very few attributes, 
such as image reconstruction where an atom has only position $(x,y)$, brightness, and possibly shape.
It is then much easier to find acceptable and useful new attributes for an atom than it would be for the object as a whole, 
simply because of the huge reduction in dimensionality.
({\it Divide and conquer}, as the slogan has it.)

A common case is where an attribute measures an additive quantity $z$ such as population number, power of signal, brightness of radiation, or flux of material.
There seems to be no science-wide term for additive quantities: I use the crisp term ``flux'', reflecting my training in astronomy.
For such fluxes, an exponential distribution is often convenient;
$$
  \pi(z) = c\,e^{-cz}, \quad \hbox{$c$ = constant.}
$$
\bigskip
We can now give Bayesian solutions to the population example.
Whatever prior we take, the likelihood function (as expressed in terms of the Irish population $\theta_4 \equiv x$) is
$$
 \Pr(D \mid \theta) = \delta(5500 + x - \theta_1) \,\delta(2500 - x - \theta_2) \,\delta(2000 - x - \theta_3) \quad [{\rm people}]^{-3}\,.
$$
Note the dimensions: the three measurements are each in units of people.
The simplest Bayesian prior is just constant, over whatever range might be deemed adequate, for example
$$
 \Pr(\theta) = 10^{-16}\ [{\rm people}]^{-4} \quad  \hbox{over $0 < \theta_i < 10000$ for each $i=1,2,3,4$.}
$$
The usual Bayesian machinery
$$
 {\rm Prior} \times {\rm Likelihood}\ =\ {\rm Joint}\ \Rightarrow\ {\rm Evidence}\ \Rightarrow\ {\rm Posterior}
$$
yields an evidence $\Pr(D) = 2\times 10^{-13}\ [{\rm people}]^{-3}$,
with flat posterior $\Pr(x \mid D) = 1/2000$ in $0 < x < 2000$ for the Irish population.

A more sophisticated prior supposes that people are distributed in tribes with mean $\mu$ (say 2) per country,
each tribe having an exponential distribution of people with mean $q$ (say 1000).
Anticipating the algebraic result of section 6.1, this translates to
$$
 \Pr(\theta) = \prod_{i=1}^4 e^{-\mu} \big( \delta(\theta_i) + e^{-\theta_i/q} \sqrt{\mu / q \theta_i} \,I_1(2\sqrt{\mu \theta_i / q})\big)
$$
in terms of the four populations, $I_1$ being the modified Bessel function.
The evidence is now
$$
 \Pr(D) = 6.9\times 10^{-13}\ [{\rm people}]^{-3}, 
$$
which is over three times larger than before, whilst the posterior is composite with
\halign
{\quad  #                                                                                                    \hfill \cr
\hbox{(a) a 19\% chance that the Irish population is $x=0$ (because no tribes happen to inhabit Ireland);}   \cr
\hbox{(b) an 11\% chance that the Welsh population is  0 instead (so that $x = 2000$, the maximum allowed);}  \cr
\hbox{(c) a 70\% chance that $0 < x < 2000$, with a flattish distribution having most probable value near 400.}\cr
}
\noindent Whether or not you favour the more sophisticated prior because of its greater evidence 
depends also on the relative plausibilities you presumably had in mind in the first place for the two models.

\hfill\eject
The population example was solvable algebraically, but usually numerical calculations are needed.
For numerical work, I suggest forcing all coordinates to lie in [0,1], with uniform prior.
In one dimension, this is easy --- just replace the original coordinate $z$ by the cumulative probability $x = \int_0^z \pi(z')\,dz'$.
Whatever the number ($d$) of attributes of an atom, though,
it is always possible to squash the original prior into uniformity $\pi(x) = 1$ over the unit hypercube $[0,1]^d$.
I recommend this discipline: there is no loss of generality and numerical exploration is likely to be easier.
Also, there is no possibility of using an ``improper'' (non-normalised) prior, through assigning an infinite range or other pathology.
Uniformity over the unit hypercube enforces proper behaviour, and is required by BayeSys.

\bigskip
\noindent{2.5. SAMPLING}
\smallskip

Only when an object $\theta$ has very few degrees of freedom can we hope to explore ``all'' $\theta$ to find the posterior ``everywhere''. 
Instead, the modern approach is to characterise the posterior in a Monte Carlo sense, by taking a dozen or more random-sample objects $\tilde\theta$ from it.

It is fortunate but true that these dozen samples will very likely answer any particular question about $\theta$ with adequate precision.
The point is that each sample object yields an independent value $\tilde Q = Q(\tilde\theta)$ for a scalar quantity $Q$ being sought.
Only occasionally will these be badly biassed overall.  
For example, there is only about a 1 in 2000 chance that their average $\langle \tilde Q \rangle$ will be more than one standard deviation from the true mean 
of $Q$ (for Gaussian statistics), and this chance drops sharply as more samples are taken.

Note that other ways of selecting a single $\theta$ to represent the posterior have serious defects.  
Suppose for instance that the posterior requires $\theta$ to lie on a circle. 
Then the mean (a popular candidate for presentation) will lie inside the circle, which is supposed to be prohibited!
It also moves if $\theta$ is re-parameterised.
The median (another plausible choice) is predicated upon being able to order the values of $\theta$, which really only makes sense if $\theta$ is restricted to one dimension.
The mode, obtained by maximising the posterior (in a method sometimes glorified with the acronym MAPP for maximum {\it a-posteriori} probability), 
also moves if $\theta$ is re-parameterised, because squeezing $\theta$ somewhere increases the posterior there to compensate.  
Hence the mode lacks an invariance we may want.  
Moreover, with a flat prior, MAPP reduces to maximum likelihood (ML), which is non-unique whenever a problem is under-constrained.
Another dastardly counter-example to MAPP estimation has already been found in the population example.
Under the ``tribal'' prior, the MAPP estimate of the Irish population is exactly 0 because of the extreme height of the delta function there, 
even though $0 < x < 2000$ is more than 3 times more likely.
Indeed, it's only because the mythical population data were assigned perfect reliability that the MAPP estimate did not collapse to zero in all four countries.

In fact, the dozen or more sample objects $\tilde\theta$ seem to be the only faithful representation that is generally accessible.
As a practical matter, representing the posterior by these samples almost forces one to use all one's data in the computation.
Using part of the dataset first, and then using the rest later, 
will be next to impossible if the posterior is compressed to a limited list of samples at the intermediate stage.
\vfill\eject

\centerline{\bigger PART 2. THEORY}
\bigskip
\noindent{$\underline{\hbox{\bf{Section 3. Markov chain Monte Carlo (MCMC)}}}$}
\bigskip

Today, Markov chain Monte Carlo algorithms are the preferred method for practical inference.
The only generally faithful way of representing the posterior distribution is through sampling, which suggests a Monte Carlo method.
And the posterior can only be reached in practice through a Markov chain of intermediate states, each learning from the previous one(s).
Hence MCMC.
The method is attributed to Metropolis {\it et al.} (1953), as re-worked with more generality by Hastings (1970).

Let our object $\theta$ have available states $i = 1,2,\cdots\,$, our prior knowledge being probabilistic: $\Pr(i) = \pi_i$.
A MCMC algorithm for exploring the states is identified by the transitions {\bf T} that it can make, with their probabilities
$$
    \hbox{$i \rightarrow j$ with probability $T_{ji} = \Pr( j \mid i )$}.
$$
Technically, a Markov transition could also involve a memory of earlier states.  
However, transitions may (and sometimes do) already involve a sophisticated history of intermediate trial or actual states, 
so the restriction is more apparent than real.

Repeatedly applying the algorithm to an arbitrary initial probability assignment {\bf p} 
will yield ${\bf p} \rightarrow {\bf T}{\bf p} \rightarrow {\bf T}^2{\bf p} \rightarrow {\bf T}^3{\bf p} \rightarrow \cdots\,$.
Eventually, {\bf p} will converge to the principal eigenvector of {\bf T} with greatest eigenvalue
(which, because the components of {\bf p} always sum to the same unit total, must be 1).
Provided the algorithm is aperiodic (so that it doesn't just bounce) and ``irreducible'' (so that every state is eventually accessible from any other),
this principal eigenvector is unique (Smith \& Roberts 1993, and Roberts \& Smith 1994).
Elements of randomness soon ensure that an algorithm is aperiodic, 
and if it happens that chains of successive {\bf p} reduce into disjoint un-mixed domains, 
we will include extra transition routes to join all such domains.
Hence it is usually straightforward to satisfy these conditions.
We will start by designing algorithms which target the prior $\pi_i$, for which we need a transition matrix whose principal eigenvector is \bfpi.
Eigenvectors are somewhat remote from the components of a matrix, being expensive to compute.  
However, any matrix with the property of ``{\bf detailed balance}''
$$
    T_{ji} / T_{ij} = \pi_j / \pi_i \quad \hbox{for all $i$, $j$}
$$
will suffice.
Thinking physically, we see that if we start correctly with $\pi_i$ objects in state $i$ and $\pi_j$ in state~$j$, 
then on applying {\bf T} the same number ($T_{ji}\pi_i$) will pass from $i$ to $j$ as pass from $j$ to $i$ ($T_{ij}\pi_j$), leaving the correct assignment intact.
$$
\hbox{
       \vbox{\offinterlineskip
             \hrule
             \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                     height4pt & \omit && \omit \cr
                               &  $i$  &  \cr
                     height4pt & \omit && \omit \cr
                    }
             \hrule
            }
       \vbox{ \hbox{${\buildrel         T_{ji}\pi_i            \over      {\hbox to 50pt{\rightarrowfill}}}      $}
              \hbox{${\buildrel {\displaystyle{\hbox to 50pt{\leftarrowfill}}} \over {\scriptstyle{T_{ij}\pi_j}}}$} }
       \vbox{\offinterlineskip
             \hrule
             \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                     height4pt & \omit && \omit \cr
                               &  $j$  &  \cr
                     height4pt & \omit && \omit \cr
                    }
             \hrule
            }
     }
$$
The corresponding algebraic proof is $ ({\bf T}\bfpi)_j = \sum_i T_{ji} \pi_i = \sum_i T_{ij} \pi_j = (\sum_i T_{ij}) \pi_j = \pi_j$.
Although not strictly necessary, detailed balance is the key to constructing useful transition matrices.  
We start with algorithms that explore the prior faithfully, leaving the extra complication of likelihood factors until later.

\bigskip
\noindent{3.1. THE NUMBER OF ATOMS}
\smallskip

Typical priors $\Pr(n)$ for the number of atoms are
$$
 \pi(n) = \left\{\ \matrix{
                      1/N\,, \hfill & \hbox{for $0 \le n < N$} \hfill & \hbox{(uniform);}   \hfill \cr
e^{-\alpha} \alpha^n / n!\,, \hfill & \hbox{for $n \ge 0$}     \hfill & \hbox{(Poisson);}   \hfill \cr
                 (1-c)c^n\,, \hfill & \hbox{for $n \ge 0$}     \hfill & \hbox{(geometric).} \hfill \cr
                }\right.
$$
and we seek an algorithm that faithfully targets any such prior.
Actually, the BayeSys program imposes a minimum value for $n$, which we ignore here for clarity. 
The natural unit of change is just one atom at a time, so the only non-zero transitions $T_{ji}$ will be $j=i+1$ (``birth'' of an atom),
$j=i-1$ (``death'' of an atom), and $j=i$ (no change).
Multiple births or deaths, as implied by the general treatment of jump-diffusion by Grenander and Miller (1994) or the reversible-jump dynamics of Green (1994),
are likely to be less often acceptable, so I restrict attention to single birth or death.
Detailed balance fixes the ratios between birth and death rates but leaves their overall magnitudes free.

I choose to let each atom decay with unit mean lifetime, so that the death rate is set as
$$
    T_{n-1,n} = n\,dt
$$
in infinitesimal interval $dt$ of artificial time.
The rationale for this is that most of the atoms will have been changed after unit time.
Regular ${\cal O}(1)$ timesteps thus give a natural sampling period $\tau$ for our multi-atom object.
Sampling more frequently would make successive objects tediously similar, whereas sampling less often might waste useful intermediate samples.
Detailed balance implies a corresponding birth rate
$$
    T_{n+1,n} = \beta_n\,dt\,, \qquad \beta_n = (n+1)\, \pi_{n+1} / \pi_n.
$$
For the three typical priors, these birth rates are
$$
 \beta_n = \Biggl\{\matrix{
             n+1, \qquad \hbox{for $n < N-1$, else 0} & \hbox{(uniform);}   \hfill \cr
             \alpha,   \hfill                         & \hbox{(Poisson);}   \hfill \cr
             (n+1)\,c, \hfill                         & \hbox{(geometric).} \hfill \cr
                                    }
$$
Starting with (say) $n$ atoms, the time to the next event is exponentially distributed
$$
    \Pr(\Delta t) = (\beta_n + n)\,e^{-(\beta_n + n)\,\Delta t}
$$
and when that event occurs, it is either birth or death in ratio $\beta_n\!:\!n$.
Thus, in the following example, samples are taken at uniform times $\ldots,5\tau, 6\tau, 7\tau, 8\tau, \ldots$ when $n$ happens to be $\ldots,3,3,2,2,\ldots\,$.

\centerline{
\vbox{
      \vskip 6pt
      \hbox{  \ \ \vbox{ \hbox{     } \hbox{ $n=3$                  } }\ \ \ \ \ 
                  \vbox{ \hbox{Birth} \hbox{\ \ \ $\big\uparrow$} }
                  \vbox{ \hbox{     } \hbox{ $n=4$                  } }
                  \vbox{ \hbox{Death} \hbox{\ \ \ $\big\downarrow$} }
          \ \ \ \ \vbox{ \hbox{     } \hbox{ $n=3$                  } }\ \ \ \ 
                  \vbox{ \hbox{Death} \hbox{\ \ \ $\big\downarrow$} }
\ \ \ \ \ \ \ \ \ \vbox{ \hbox{     } \hbox{ $n=2$                  } }\ \ \ \ \ \ \ \ \ 
                  \vbox{ \hbox{Birth} \hbox{\ \ \ $\big\uparrow$} }
              \ \ \vbox{ \hbox{     } \hbox{ $n=3$                  } }
           }
      \hrule
      \hbox{ \hskip 50pt
             \vbox{ \hbox{$\,|$}\hbox{$5\tau$}  }
             \hskip 80pt
             \vbox{ \hbox{$\,|$}\hbox{$6\tau$} }
             \hskip 80pt
             \vbox{ \hbox{$\,|$}\hbox{$7\tau$} }
             \hskip 80pt
             \vbox{ \hbox{$\,|$}\hbox{$8\tau\qquad\rightarrow$\ time} }
           }
      \vskip 6pt
     }
}
\noindent Phillips \& Smith (1996) review dimension-changing methods like this at a more abstract level.

\bigskip
\noindent{3.2. COORDINATES}
\smallskip

As recommended above, we require the prior to be uniform $\pi(x)=1$ over the unit hypercube $[0,1]^d$ for coordinates $x$.
Within the computer, of course, coordinates will come from a finite set determined by the finite precision of the hardware.
This observation suggests a ``modern digital'' style of treatment.
My convention, taking the computer word length to be $W$ bits (usually 32) is to let the available values be odd multiples of $2^{-(W+1)}$, 
labelled by {\tt unsigned} ({\it i.e.} non-negative) integers $k$ from 0 to $2^W - 1$.
$$
  x = 2^{-W}(k + \hbox{$1\over2$})
$$
The rules of integer arithmetic (modulo $2^W$) ensure that $x$ is wraparound continuous, which is never harmful and sometimes appropriate.
This makes all states {\it a-priori-}equivalent geometrically as well as by prior measure, independently of any concern over the vagaries of floating-point representations.  
Helpfully, the states avoid the boundaries $x=0$ and $x=1$ and centre $x={1\over2}$, which might be special in an application.

When $x$ has $d$ dimensions (more than one), we can fill the unit hypercube with a space-filling Hilbert curve (Hilbert 1891, Sagan 1994),
thus reducing the topology to a one-dimensional coordinate along the curve.
The hypercube contains $(2^W)^d$ digital points, so each point along the curve is labelled by an integer with $Wd$ bits, or $d$ words.
The diagrams below show Hilbert curves in two dimensions.

A Hilbert curve can be constructed recursively from its generator (Butz 1969, 1971).  
At the top level the generator is a path around the $2^d$ corners of a hypercube, using segments directed parallel to the axes.  
In computer parlance, this path is a ``Gray code'' for $d$-bit integers.
At the next (second) level, smaller copies of each generator are placed at each first-level point, oriented to keep the ends of successive generators adjacent.
There are now $2^{2d}$ points along the path.
At the third level, yet smaller copies of each generator are placed at each second-level point, again oriented to keep the ends of successive generators adjacent.
By now, the path has $2^{3d}$ points.

$$
\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\Za\Oa}
      \hbox{\Ia\Ia}
     }
\hskip -56pt
\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\Zb\Ob\Zb\Ob}
      \hbox{\Ib\Lb\Ib\Ib}
      \hbox{\Lb\Ob\Zb\Ib}
      \hbox{\Zb\Ib\Lb\Ob}
     }
\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\Zc\Oc\Zc\Oc\Zc\Oc\Zc\Oc}
      \hbox{\Ic\Lc\Ic\Ic\Ic\Lc\Ic\Ic}
      \hbox{\Lc\Oc\Zc\Ic\Lc\Oc\Zc\Ic}
      \hbox{\Zc\Ic\Lc\Zc\Zc\Ic\Lc\Oc}
      \hbox{\Ic\Zc\Zc\Oc\Zc\Zc\Oc\Ic}
      \hbox{\Lc\Ic\Zc\Ic\Lc\Oc\Lc\Ic}
      \hbox{\Zc\Oc\Lc\Oc\Zc\Ic\Zc\Oc}
      \hbox{\Ic\Lc\Zc\Ic\Lc\Zc\Ic\Ic}
     }
$$
And so on, at finer and finer scales until all $Wd$ bits have been used.
Each stage adds resolution without changing the existing larger-scale pattern: with 4-bit axes $0,1,\ldots,15$ there are $16^d = 16^2 = 256$ points.

$$
\hbox{
{\vrule height 0.4pt width 6pt depth 0pt} \hskip -4pt
\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\Zd\Od\Zd\Od\Zd\Od\Zd\Od\Zd\Od\Zd\Od\Zd\Od\Zd\Od}
      \hbox{\Id\Ld\Id\Id\Id\Ld\Id\Id\Id\Ld\Id\Id\Id\Ld\Id\Id}
      \hbox{\Ld\Od\Zd\Id\Ld\Od\Zd\Id\Ld\Od\Zd\Id\Ld\Od\Zd\Id}
      \hbox{\Zd\Id\Ld\Zd\Zd\Id\Ld\Od\Zd\Id\Ld\Zd\Zd\Id\Ld\Od}
      \hbox{\Id\Zd\Zd\Od\Zd\Zd\Od\Id\Id\Zd\Zd\Od\Zd\Zd\Od\Id}
      \hbox{\Ld\Id\Zd\Id\Ld\Od\Ld\Id\Ld\Id\Zd\Id\Ld\Od\Ld\Id}
      \hbox{\Zd\Od\Ld\Od\Zd\Id\Zd\Od\Zd\Od\Ld\Od\Zd\Id\Zd\Od}
      \hbox{\Id\Ld\Zd\Id\Ld\Zd\Id\Ld\Id\Ld\Zd\Id\Ld\Zd\Id\Id}
      \hbox{\Ld\Od\Zd\Zd\Od\Zd\Zd\Od\Zd\Zd\Od\Zd\Zd\Od\Zd\Id}
      \hbox{\Zd\Id\Ld\Od\Ld\Id\Zd\Id\Ld\Od\Ld\Id\Zd\Id\Ld\Od}
      \hbox{\Id\Zd\Od\Id\Zd\Od\Ld\Od\Zd\Id\Zd\Od\Id\Zd\Od\Id}
      \hbox{\Ld\Id\Ld\Id\Id\Ld\Zd\Id\Ld\Zd\Id\Id\Ld\Id\Ld\Id}
      \hbox{\Zd\Od\Zd\Od\Id\Zd\Zd\Od\Zd\Zd\Od\Id\Zd\Od\Zd\Od}
      \hbox{\Id\Ld\Id\Id\Ld\Id\Zd\Id\Ld\Od\Ld\Id\Id\Ld\Id\Id}
      \hbox{\Ld\Od\Zd\Id\Zd\Od\Ld\Od\Zd\Id\Zd\Od\Ld\Od\Zd\Id}
      \hbox{\Zd\Id\Ld\Zd\Id\Ld\Zd\Id\Ld\Zd\Id\Ld\Zd\Id\Ld\Od}
     }
\hskip -9pt {\vrule height 0.4pt width 6pt depth 0pt}
     }
$$
The algorithm I actually use (Skilling 2004a) works rather differently, by decoding a $Wd$-bit Hilbert integer as the corner of a $Wd$-dimensional hypercube,
and then rastering back through the bits to undo the extra work that was implicitly done in that larger (but quick because just a Gray code) transform.
The final effect is just the same, but the program is smaller and faster than top-down code such as that of Lawder (2000).

Obviously, we are not getting something for nothing here.
Yes, we are turning $d$ numbers into one, but that one number has to have $d$ times the precision of its constituents.
The procedure only seems paradoxical if one takes the mystical view of a ``number'' as something of infinite precision with the limit already accomplished.
In reality, our computers are finite.  $W=32$.

A Hilbert curve preserves locality, meaning that contiguity along the line implies contiguity in space (though not conversely).  
In fact, the Hilbert curve has the greatest degree of locality possible for a space-filling curve.
It follows from locality that a Hilbert curve also preserves continuity, 
meaning that a function that is continuous in space must also be continuous along the line (though not conversely).
Differentiability, though, is not preserved.
Even so, there are useful methods for one-dimensional exploration which can immediately be applied in $d$ dimensions,
by invoking extended-precision integer arithmetic.

A length of line that occupies a fraction $f$ of the total necessarily fills that same fraction $f$ of the hypercube volume.
The winding pattern spirals out very roughly isotropically, so that each coordinate ranges over something like $f^{\,1/d}$ as the length is traversed.
Winding a one-dimensional line into a $d$-dimensional volume, though, necessarily places some points that are close in space far apart on the line.
With the very precise and binary Hilbert pattern, there is at the extreme only one crossing of the central line $x_1 = {1\over2}$ (vertical in the diagrams), 
so that points on opposite sides of that line are mostly far apart on the Hilbert curve.
Similarly, there is only one crossing of the (wraparound continuous) abscissa $x_1 = 0$, and none at all of the horizontal baseline $x_2 = 0$.
These barriers to easy spatial movement are made harmless by moving them around after each observation interval $\tau$, 
re-randomising the origin of the Hilbert curve within the hypercube and re-permuting its orientation by taking the geometrical axes in arbitrary order.

One-dimensional representation along a line implies an ordering of locations, so that an atom's neighbours can be quickly identified by keeping a linked list.
Neighbouring atoms along the line may not be quite the closest in space, 
but they still turn out to be useful because they identify pairs of atoms that may be similarly related to the data, 
and whose behaviour may thereby be correlated.
In order to keep the ordering of atoms unambiguous, I accept a restriction that not more than one atom may occupy a single point.
There is a huge number of points, so this is only a technicality.

$$
\hbox{
{\vrule height 0.3pt width 6pt depth 0pt} 
\hskip -4pt
\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\zd\od\zd\od\zd\od\zd\od\zd\od\zd\od\zd\od\zd\od}
      \hbox{\id\ld\id\id\id\ld\id\id\id\ld\id\id\id\ld\id\id}
      \hbox{\ld\od\zd\id\ld\od\zd\id\ld\od\zd\id\ld\od\zd\id}
      \hbox{\zd\id\ld\zd\zd\id\ld\od\zd\id\ld\zd\zd\id\ld\od}
      \hbox{\id\zd\zd\od\zd\zd\od\id\id\zd\zd\od\zd\zd\od\id}
      \hbox{\ld\id\zd\id\ld\od\ld\id\ld\id\zd\id\ld\od\ld\id}
      \hbox{\zd\od\ld\od\zd\id\zd\od\zd\od\ld\od\zd\id\zd\od}
      \hbox{\id\ld\zd\id\ld\zd\id\ld\id\ld\zd\id\ld\zd\id\id}
      \hbox{\ld\od\zd\zd\od\zd\zd\od\zd\zd\od\zd\zd\od\zd\id}
      \hbox{\zd\id\ld\od\ld\id\zd\id\ld\od\ld\id\zd\id\ld\od}
      \hbox{\id\zd\od\id\zd\od\ld\od\zd\id\zd\od\id\zd\od\id}
      \hbox{\ld\id\ld\id\id\ld\zd\id\ld\zd\id\id\ld\id\ld\id}
      \hbox{\zd\od\zd\od\id\zd\zd\od\zd\zd\od\id\zd\od\zd\od}
      \hbox{\id\ld\id\id\ld\id\zd\id\ld\od\ld\id\id\ld\id\id}
      \hbox{\ld\od\zd\id\zd\od\ld\od\zd\id\zd\od\ld\od\zd\id}
      \hbox{\zd\id\ld\zd\id\ld\zd\id\ld\zd\id\ld\zd\id\ld\od}
     }
\hskip -11pt {\vrule height 0.3pt width 7pt depth 0pt}
\hskip -117pt
\lower 2pt\vbox{\lineskip = -0.4pt \baselineskip = 0pt
      \hbox{\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo}
      \hbox{\oo}
      \hbox{\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo}
      \hbox{\oo}
      \hbox{\oo}
      \hbox{\oo}
      \hbox{\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo\oo\oo\oo\oo\oo\oo $\bullet$}
      \hbox{\oo}
      \hbox{\oo}
     }
     }
$$

\bigskip
\noindent{3.3. ROLE OF LIKELIHOOD}
\smallskip

We have, so far, discussed the form of the prior, 
and settled on a flat prior over the unit hypercube for the attributes of each of a variable number of atoms.
For exploration of the prior by MCMC methods, we have introduced birth and death rates for changing the number of atoms,
and movement along a Hilbert curve to change their positions.
It is now time to introduce the data.

To correct an MCMC algorithm and make it converge to the required posterior instead of merely to the prior, its transition probabilities need to be adjusted.
Suppose that transitions are performed, not definitively, but probabilistically according to acceptance probabilities $A$.
This will reduce an effective $j \leftarrow i$ transition rate from $T_{ji}$ to $A_{ji} T_{ji}$.
Detailed balance should conform to the posterior $L(\theta)\pi(\theta)$, so we require
$$
    {A_{ji} T_{ji} \over A_{ij} T_{ij}} =
    {L(\theta_j)\pi(\theta_j) \over L(\theta_i)\pi(\theta_i)}.
$$
Because the basic transition scheme $T$ will already conform to the prior $\big(T_{ji}/T_{ij} = \pi(\theta_j)/\pi(\theta_i)\big)$,
the acceptance probabilities must simply be in ratio of the likelihoods.
$$
    {A_{ji} \over A_{ij}} = {L(\theta_j) \over L(\theta_i)}
$$
To avoid wasting resources, we want to accept as often as possible, so we set whichever acceptance would be larger to the largest possible probability value of 1.
Hence
$$
    A_{ji} = \min \left( 1 ,\, {L(\theta_j) \over L(\theta_i)} \right)\,,
$$
implying the consistent backwards acceptance $A_{ij} = \min \left( 1 ,\, L(\theta_i) / L(\theta_j) \right)$, equivalent to the rule:
$$
  \hbox{``Accept transition $\ j \leftarrow i\ $ if and only if $\ L(\theta_j) \ge {\tt Uniform}\big(0, L(\theta_i)\big)$''}\,,
$$
where {\tt Uniform} means a random sample from the uniform distribution over the quoted range.
Although delightfully simple, this method (Metropolis {\it et al.} 1953, Hastings 1970) can be inefficient.
The worst difficulty lies in how to choose the magnitude of change $\delta\theta$ in the transitions.
If $\delta\theta$ is set too small, diffusion of position will be unnecessarily (and quadratically) slow.
If $\delta\theta$ is set too large, nearly every proposal will be rejected, and the procedure effectively stops --- without having converged.
We need a way of ensuring that $\delta\theta$ has the correct scale.

Incidentally, it is now apparent why attention is universally focussed on pair-wise transitions rather than on longer cycles
$$
\matrix{
       \lower 10pt\vbox{\offinterlineskip
             \hrule
             \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                     height4pt & \omit && \omit \cr
                               &  $i$  &  \cr
                     height4pt & \omit && \omit \cr
                    }
             \hrule
            }
       & \!\!\!\!\!\!\!\rightarrow\!\!\!\!\!\!\!
       & \lower 10pt\vbox{\offinterlineskip
               \hrule
               \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                       height4pt & \omit && \omit \cr
                                 &  $j$  &  \cr
                       height4pt & \omit && \omit \cr
                      }
               \hrule
              }
       \cr
       \uparrow & & \downarrow \cr
       \lower 10pt\vbox{\offinterlineskip
             \hrule
             \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                     height4pt & \omit && \omit \cr
                               &  $l$  &  \cr
                     height4pt & \omit && \omit \cr
                    }
             \hrule
            }
       & \!\!\!\!\!\!\!\leftarrow\!\!\!\!\!\!\!
       & \lower 10pt\vbox{\offinterlineskip
               \hrule
               \halign{&\vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                       height4pt & \omit && \omit \cr
                                 &  $k$  &  \cr
                       height4pt & \omit && \omit \cr
                      }
               \hrule
              }
       \cr
     }
$$
that might seem to offer systematic exploration instead of diffusion.
The flow of samples around such a cycle is limited by the relative probability of its least likely state.
With larger cycles, this will be an ever-smaller fraction of the highest probability where most of the samples will reside, 
so the flow decreases and the cycle loses efficiency.
The smallest non-trivial cycle (2 members only) is best.

\bigskip
\noindent{3.4. BINARY SLICE SAMPLING}
\smallskip

In a 1-dimensional representation, the obvious ``analogue'' transition scheme is to select $\delta x$ at some appropriate scale (to be assigned somehow), 
and then use it to either increment or decrement $x$ within the $[0,1]$ window.
$$
    x\ \leftarrow\ x \pm \delta x \pmod 1
$$
In the underlying integer representation, with modulo $2^W$ arithmetic understood,
$$
    k\ \leftarrow\ k \pm \delta k\,.
$$
With increment and decrement being equi-probable, the algorithm is clearly in detailed balance over a uniform prior, no matter how $\delta k$ was set.
An alternative ``digital'' transition scheme is to decide on some number $b$ up to $W$, and then randomise the low order $b$ bits of $k$.
$$
    k\ \leftarrow\ k \oplus {\tt Uniform}[\,0,2^b)
$$
where ``$\oplus$'' represents binary exclusive-or, here with a random integer less than $2^b$.  
Extra randomisation at the same scale is achieved by taking the coordinate relative to some non-zero origin $o\,$.
$$
    k\ \leftarrow\ \big(\,(k - o) \oplus {\tt Uniform}[\,0,2^b)\,\big) + o
$$
This idea underlies binary slice sampling.

Let the initial state, represented by a $B$-bit integer $k$ from domain~${\cal D}_0$ (usually the full Hilbert line), have likelihood $L(k)$.
Set an acceptance level 
$$
    a \in \hbox{\tt Uniform}(0,L(k))
$$
and randomise all $B$ bits of $k$ to reach
$$
    j_0 = k \oplus {\tt Uniform}[\,0,2^B)
$$
in domain ${\cal D}_0$.
According to Metropolis-Hastings, we are entitled to accept any new trial state $j$ whose likelihood exceeds $a$, 
provided it was generated symmetrically with the reverse transition $j \rightarrow k$ being just as probable as the forwards $k \rightarrow j$.
Randomisation was indeed symmetric, so if the likelihood for $j_0$ is greater than $a$, accept it.

If not, halve the domain size to ${\cal D}_1 \subset {\cal D}_0$ by randomising only the lowest $B-1$ bits,
$$
    j_1 = k \oplus {\tt Uniform}[\,0,2^{B-1}) ,
$$
keeping the top bit intact.
This transition too is symmetric because both $k$ and $j_1$ lie in the same domain ${\cal D}_1$, 
so $j_1 \rightarrow k$ is just as likely as $k \rightarrow j_1$ (with probability $1/2^{B-1}$ as it happens).
So, if the likelihood for $j_1$ is greater than $a$, accept it.

If not, halve the domain size again to ${\cal D}_2$ by randomising only the lowest $B-2$ bits,
$$
    j_2 = k \oplus {\tt Uniform}[\,0,2^{B-2}) .
$$
Again, this transition is symmetric, because $k$ and $j_2$ both lie in ${\cal D}_2$ so $j_2 \rightarrow k$ is just as likely as $k \rightarrow j_2$,
and each direction had the same chance of being aborted at the earlier stages.
(This would no longer be true if $j$ was being obtained by addition or subtraction instead of bit-randomisation, 
because $k$ and $j_2$ would have approached each other via different ranges of $j_1$ and would hence have had different chances of aborting then: 
that's why I use the binary scheme.)
So, if the likelihood for $j_2$ is greater than $a$, accept it.

If not, keep going by randomising fewer and fewer low-order bits, until an acceptable likelihood is reached.
At worst, this procedure terminates after $B$ trials with the original state $k$, necessarily acceptable because $a$ was constructed to be beneath $L(k)$.
More likely, an acceptable state will be found somewhere around the scale of the controlling likelihood function, at which $\Delta \log L = {\cal O}(1)$.
All we ask of the likelihood is that it be a reasonably continuous function of position, so that there {\bf is} an acceptable scale.

\centerline{
\vbox{\offinterlineskip{
\halign{&   #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #  &  #\vphantom{\Big(}\cr
        &      &     &     &     &  &\ \qquad$k$&  &     &     &     &     &     &     &     &     &     &      \cr
        \noalign{\hrule}
        &\H\ \ &     &     &     &     &     &     &     & \Da &     &     &     &     &     &     &     &\ \ \H\cr
        \noalign{\hrule}
        &\H\ \ &     &     &     & \Db &     &     &     & \H  &     &     &     & \P  &     &     &     &\ \ \H\cr
        \noalign{\hrule}
        &\H\ \ &     & \P  &     & \H  &     & \Dc &     & \H  &     & \P  &     & \H  &     & \P  &     &\ \ \H\cr
        \noalign{\hrule}
        &\H\ \ & \P  & \H  & \P  & \H  & \Dd & \H  & \P  & \H  & \P  & \H  & \P  & \H  & \P  & \H  & \P  &\ \ \H\cr
        \noalign{\hrule}
        &      &     &     &     &    &$j_3$\ &    &     &     &     &     &     &     &     &     &     &      \cr
       }
}}}
\noindent
There appear to be barriers to free movement at special places such as $x = {1\over2}$
(where passage between adjacent integers $0111111\cdots$ and $1000000\cdots$ requires all $B-1$ low bits to change).
These can be made harmless by carrying out the procedure with respect to a randomly offset origin.

There is nothing special about the halving procedure just described, other than being simple to implement in integer arithmetic.
Any set of pre-defined nested domains would suffice: slice sampling was introduced by Neal (2003)  in this more general form.
Actually, Neal started his slice-sampling algorithm from some intermediate length scale, 
and could step outward by widening the domain as well as inward by shrinkage.
In a multi-atom environment, that is less important.
Each atom has left and right neighbours along the Hilbert curve, and their location will often give sensible limits on that atom's movement.
A location beyond can be considered ``out-of-range'', and rejected at once.
Stepping out is no longer particularly necessary, because the early steps inward from the full number of bits 
mostly involve merely the trivial overhead of checking whether a trial location is in range.

The following fragment of pseudo-code shows the basic implementation of binary slice-sampling (Skilling \& MacKay 2003).
The entry state is represented by the $Wd$-bit integer $k$.
\halign
{  #                                                                                \hfill & \quad #                                         \hfill \cr
                                                                                           &                                                        \cr
\hbox{$\qquad b \leftarrow W \times d                                                   $} & full number of bits $B$                                \cr
\hbox{$\qquad o \leftarrow {\tt Uniform}[\,0,2^b)                                       $} & random origin                                          \cr
\hbox{$\qquad a \leftarrow {\tt Uniform}\big(\, 0 ,\, L(k) \big)                        $} & level of acceptance probability                        \cr
\hbox{$\qquad {\bf do}\{                                                                $} & loop                                                   \cr
\hbox{$\qquad\qquad j \leftarrow \big(\,(k - o) \oplus {\tt Uniform}[\,0,2^b)\,\big) + o$} & trial position around $k$                              \cr
\hbox{$\qquad\qquad b \leftarrow b - 1                                                  $} & shrink interval around $k$                             \cr
\hbox{$\qquad\} {\bf while}(\,j \hbox{\ is out-of-range {\bf or} } L(j) < a\,)          $} & until $j$ is in range and if so is acceptably probable \cr
                                                                                           &                                                        \cr
}
\noindent
The fact that the position almost certainly changes (exit state $j \ne k$) more than justifies the mild extra cost of slice sampling, 
as opposed to straightforward Metropolis-Hastings rejection.
\vfill\eject

\noindent{$\underline{\hbox{\bf{Section 4. Annealing}}}$}
\bigskip

In any large application, it is true almost by definition that the significant ``volume'' of the posterior occupies only a tiny fraction of the volume of the prior.
Technically, the {\bf information}, or negative entropy
$$
    H = + \int P(\theta) \log\bigl(P(\theta) / \pi(\theta)\bigr) d\theta,
 \quad    P = \hbox{posterior},\ \pi = \hbox{prior},
$$
is likely to be much bigger than unity, in keeping with the dimensionality of the problem.
It is then difficult to jump directly from the prior to the posterior, which is why we need a Markov chain.
Practically all samples from the prior will lie out on the far tails of the posterior, 
where the local structure may give poor guidance about how to approach the isolated peak(s).
To avoid this, we divide the application into subsidiary steps within which $H$ does not increase too much.
We arrange that there is significant overlap between the initial and final distributions of each step, and the computation can then proceed in reasonable safety.

The traditional problem of locating a needle in a haystack is analogous: the needle occupies 1 unit in a haystack of volume $V = \exp(H)$.
Instead of a direct search, which would take $V$ or so trials, we divide the haystack into halves, quarters {\it etc.}, 
thus managing to locate the needle in $\log(V) = H$ steps instead.

Although there are many possible paths between the prior and the posterior, 
one particular path (introduced by Kirkpatrick {\it et al.} 1983) is specially sympathetic to the formalism.
Gelman \& Meng (1998) suggest more general paths, because all that matters is good overlap between successive steps.
I have tried a couple of others, but found no advantage.
Let $L(\theta)$ be the likelihood
$$
    L(\theta) = \Pr(D \mid \theta)
$$
and let $\lambda$ be a numerical coefficient.
Instead of working with $L$ directly, 
we work with a modified likelihood $L^\lambda (\theta)$ which induces a modified posterior proportional to $L^\lambda (\theta) \pi(\theta)$.

Setting $\lambda = 0$ switches the likelihood off ($L^0 = 1$), so the modified posterior is just the prior.
Setting $\lambda = 1$ switches the likelihood on ($L^1 = \hbox{likelihood}$), so the modified posterior is simply the true posterior.
And, by increasing $\lambda$ gently from 0 to 1, we can divide the application into small, safe steps.
In fact we can continue further: making $\lambda$ even larger makes the modified posterior sharpen around the point(s) of maximum likelihood.
$$
\matrix {
           \lambda = 0 &\hbox to 20pt{\rightarrowfill}
          &\lambda = 1 &\hbox to 20pt{\rightarrowfill}
          &\lambda = \infty                             \cr
           \hbox{prior}     & 
          &\hbox{posterior} & 
          &\hbox{maximum}                              \cr
        }
$$
We can thus use the same program for {\bf both} Bayesian sampling of the posterior {\bf and} maximum-likelihood, or indeed for maximising an arbitrary function.

In a thermodynamic analogy that is productive of terminology, $\lambda$ is inverse temperature $1/T$ (so we call it ``coolness''),
and $-\log L$ is energy $\cal E$, whence the modified likelihood factor becomes the familiar $\exp(-{\cal E}/kT)$ with Boltzmann's constant $k=1$.
This is why passage along this particular path is called ``annealing''.
We talk of an ensemble of several objects $\{\theta\}$ being ``at equilibrium'' if they sample the modified posterior faithfully.

Annealing also allows us to calculate the values of evidence $E$ and information $H$.  
We generalise $E$, the evidence
$$
    E \equiv \Pr(D) = \int \Pr(D \mid \theta) \Pr(\theta) d\theta
      = \int L\,d\pi,
$$
to
$$
    E(\lambda) \equiv \int L^\lambda d\pi
$$
which differentiates to give
$$
 {d(\log E) \over d\lambda}
   = {\int L^\lambda \log L \,d\pi \over \int L^\lambda d\pi}
   \equiv \langle \log L \rangle_\lambda\,.
$$
As the central expression defines, the angle-brackets here denote averaging over the posterior as modified to coolness $\lambda$.
The purpose of annealing is to calculate the posterior (by sampling) at unit coolness, 
so {\it a fortiori} we can sample at intermediate coolnesses to pick up the required averages.
And, because $E(0) = \int d\pi = 1$, the evidence value we seek is the exponential of their integral over $\lambda$
$$
    \Pr(D) = E(1) = \exp \int_0^1 {d(\log E) \over d\lambda}\,d\lambda = \exp \int_0^1 \langle \log L \rangle_\lambda\,d\lambda\,.
$$
(I was startled by this result when I stumbled across it, and enthusiastically took it to the late -- and widely missed -- Edwin Jaynes.
He looked at it and remarked that the formula was ``well known to those who know these things''.
Thank you Ed.  Such identities are indeed commonplace in thermodynamics.
In numerical work, the method is called ``thermodynamic integration''.)
A similarly short derivation yields
$$
    H(\lambda) = \lambda \langle \log L \rangle_\lambda - \log E(\lambda),
$$
with $\lambda = 1$ being the case Bayesians usually want.

How should we control $\lambda$?
One way of proceeding is to set up some chain of values of $\lambda$, and let the object(s) diffuse up and down, 
as well as equilibrating in $\theta$ at the current coolness.
This is known as ``simulated tempering'' (Marinari \& Parisi 1992).
If the chain is appropriate (so that the beginning and end of each link overlap well enough), and if the links are weighted by their correct evidence values, 
then any object that successfully wanders from the prior end of the chain to the posterior end, and back, 
is claimed to have given a faithful sample of the posterior.
That sounds attractive.  But the evidence values are numerical, and rely upon adequate equilibration of $\theta$, so the guarantee is not what it seems.  
If an implementation failed to move $\theta$ at all, it would still appear to work.  
A variant, ``replica exchange'' (Swendsen \& Wang 1986), starts by putting an object on each link of the chain, and letting them exchange position as they equilibrate.  
This avoids having to pre-acquire evidence values, but it starts far from equilibrium because at the beginning we only know how to sample $\theta$ at the prior end.
Replica exchange, like everything else, relies on successful equilibration in $\theta$.
Even ``exact sampling'' (Propp \& Wilson 1996), which is based on similar ideas, cannot succeed if $\theta$ evolves too slowly.

I prefer not to control $\lambda$ by diffusion.  
The number of coolness values will be $H$ or so, so that an object will take $H^2$ iterates to diffuse along the chain.  
This seems wasteful by a large factor $H$.  
I think it is better to cool systematically, even if equilibration must thereby be damaged.  
And I think it wise to take a hint from physics, where the efficient route to equilibrium is usually through slow cooling, 
trying to remain always close to equilibrium.  
All too often, systems lock into metastable states if they are quenched too rapidly.

A successful program, then, needs an ``annealing schedule'' that defines how fast the coolness $\lambda$ may increase.
Crudely, the change in $\lambda$ per cooling step should presumably allow the relative information between its beginning and end to be ${\cal O}(1)$, 
hence the suggestion (by Otten \& van Ginneken 1984, and others) that information should increase at a roughly constant rate,
whatever rate that may be.
But the matter is a little more subtle than that, and I think that you, the user, should have control of the overall numerical rate, 
if only because applications differ in their difficulty.

\vfill\eject
\noindent{4.1. SELECTIVE ANNEALING}
\smallskip

Let us use an ensemble of $\cal N$ member objects, perhaps 10--100 of them, with the $j^{\rm th}$ having likelihood $L_j$.
Suppose this is in equilibrium at coolness $\lambda$.
We now wish to cool by an additional $\delta \lambda$.
This could be done by weighting the objects by
$$
    w_j \propto L_j{}^{\delta\lambda}
$$
or, with normalisation to $\langle w \rangle = 1$,
$$
    w_j = (L_j / \overline L)^{\delta\lambda}, \quad
 \overline L = \big\langle L^{\delta\lambda} \big\rangle^{1 / \delta\lambda}.
$$
Weighted averages over the objects would then be faithful to the new coolness.  
However, log-likelihoods are usually large, so that the weights would become very non-uniform after significant cooling, 
leading to gross numerical inefficiency as computer time was wasted on improbable objects.
The ``annealed importance sampling'' method of Neal (2001) uses similar weights 
to discriminate between objects trapped in different modes of a multimodal distribution, but suffers from similar inefficiency in large problems.
(Actually, Neal used accumulated evidence values instead of likelihoods, but these have an uncertain status in BayeSys, which allows objects to mix their identity by exchanging atoms.)

Instead, draw $\cal N$ new samples from the weighted ensemble, ensuring a mean multiplicity $\langle n_j \rangle = w_j$ for each original object.  
The members of the new ensemble will then have equal weight again.
To reduce changes in the ensemble, it is desirable to keep the actual (integer) number of copies $n_j$ as close as possible to its mean, 
being either the integer immediately below or immediately above $\langle n_j \rangle$.
It is further desirable to treat similar objects as similarly as possible.
For example, if 10 objects have $\langle n \rangle = 0{\cdot}3$, we should keep exactly 3 of them, omitting the other 7.  
Likewise, if 10 objects have $\langle n \rangle = 2{\cdot}4$, we should keep exactly 24 of them, taking either 2 or 3 copies of each.  

These desiderata can best be satisfied by ordering the $L_j$ into increasing (or decreasing) order: for example with ${\cal N} = 4$ we might have:-
\smallskip
\centerline{\vbox{\offinterlineskip
      \halign{  &\vrule# &
             \strut\quad#\hfil\quad &
                           \vrule#\quad &
                             \strut\hfil#\hfil&
                                           \strut\hfil#\hfil&
                                                       \strut\hfil#\hfil&
                                                                     \strut\hfil#\hfil&
                                                                                 \strut\hfil#\hfil&
                                                                                               \strut\hfil#\hfil&
                                                                                                           \strut\hfil#\hfil&
                                                                                                                         \strut\hfil#\hfil&
                                                                                                                               \strut\hfil#\hfil&
                                                                                                                                \quad\vrule#      \cr
              \noalign{\hrule}
  height2pt & \omit                   & &\omit&   \omit     &   \omit   &   \omit     &   \omit   &   \omit     &   \omit   &   \omit     &\omit& \cr
            & Object $j$              & &     &     1       &           &     2       &           &     3       &           &     4       &     & \cr
            & Normalised weight $w_j$ & &     & $0{\cdot}5$ &           & $0{\cdot}7$ &           & $1{\cdot}0$ &           & $1{\cdot}8$ &     & \cr
            & Cumulant weight         & &  0  &$\rightarrow$&$0{\cdot}5$&$\rightarrow$&$1{\cdot}2$&$\rightarrow$&$2{\cdot}2$&$\rightarrow$&  4  & \cr
  height2pt & \omit                   & &\omit&   \omit     &   \omit   &   \omit     &   \omit   &   \omit     &   \omit   &   \omit     &\omit& \cr
              \noalign{\hrule}
             }
      }
}
\smallskip
\noindent We then draw an initialising random variable $r \in \hbox{\tt Uniform}(0,1)$ to set a sequence \hbox{$\{r,\,r+1,\,r+2,\ldots\}$}.  
Whenever one of the sequence intersects the cumulant weight, we draw that one as a new object.
For example, if $r=0{\cdot}4$ the sequence would be $\{0{\cdot}4,\,1{\cdot}4,\,2{\cdot}4,\,3{\cdot}4\}$, which intersects the cumulants at \hbox{$j = \{1,3,4,4\}$}.
The least likely object 1 is drawn once, object 2 is omitted, object 3 is taken once, and the most likely object 4 is taken twice.  
The net effect is to copy object 4 onto object 2, duplicating the former while deleting the latter.
Depending on the value of $r$ (random between 0 and 1), the various multiplicities would have been:-
\smallskip
\centerline{\hbox{
\vbox{\offinterlineskip
      \halign{\strut#\hfil\  \cr
       $r=1$         \cr
       $\vdots$      \cr
       $r=0{\cdot}5$ \cr
       $r=0{\cdot}2$ \cr
       $r=0$         \cr
                     \cr
                     \cr
             }
      }
\vbox{\offinterlineskip
      \halign{  &\vrule#&
     \strut\ \hfil#\hfil\ &
             \strut\ \hfil#\hfil\ &
                     \strut\ \hfil#\hfil\ &
                             \strut\ \hfil#\hfil\ &
                                                        \vrule# &
                                                            \strut\ #\hfil
                                                                        \cr
              \omit     &     1     &     2     &     3     &     4     & \omit & Object $j$ \cr
              \multispan6\hrulefill                                             & \omit \cr
                        &           &           &           &           &       & \omit \cr
                        &     0     &     1     &     1     &     2     &       & \omit \cr
                        &           &           &           &           &       & Actual multiplicities $n$ \cr
              \multispan6\hrulefill                                             & \omit \cr
                        &     1     &     0     &     1     &     2     &       & \omit \cr
              \multispan6\hrulefill                                             & \omit \cr
                        &     1     &     1     &     1     &     1     &       & \omit \cr
              \multispan6\hrulefill                                             & \omit \cr
              \omit     &           &           &           &           & \omit & \omit \cr
              \multispan6\hrulefill                                             & \omit \cr
              \omit     &$0{\cdot}5$&$0{\cdot}7$&$1{\cdot}0$&$1{\cdot}8$& \omit & Mean multiplicities $\langle n \rangle$ \cr
              \multispan6\hrulefill                                             & \omit \cr
             }
      }
}}
\smallskip
\noindent
Necessarily, the mean multiplicities agree with the quoted weights.

I suggest that $\delta\lambda$ be chosen so that, in one cooling step, the average number of copy operations per object is limited to some maximum 
--- to be supplied by you the user as a floating-point number $\tt Rate$ (which will often be set less than 1 for safety).
$$
    \langle\,|w-1|\,\rangle \approx {\tt Rate}/3 \qquad \Rightarrow \qquad \delta\lambda
$$
The factor 3 is merely for rough backward compatibility with earlier code, and $\langle\,|w-1|\,\rangle$ is a monotonically increasing function of $\delta\lambda$, 
so this equation is easily soluble numerically.
However, accidental coalescence of all the likelihoods $L_j$ could make $\delta\lambda$ damagingly large.
To make this less likely even when the ensemble is small, we can include a dozen or so extra likelihoods taken from recent ensembles.
Even if these are a little atypical for the current coolness, it won't matter because the enhanced range will merely slow the cooling down a little, adding to safety.
As a second, possibly un-necessary, line of defence, $\delta\lambda$ can be prevented from more than doubling between cooling steps.
Such reductions in $\delta\lambda$ stabilise the annealing schedule.
They only matter when $\cal N$ is fairly small, but they enable the schedule to operate sensibly even if $\cal N$ is only 1 (when copying objects is impossible).
Finally, the user can impose whatever limit on coolness is appropriate, either 1 for Bayesian calculations, or no limit for maximisation.

\bigskip
\noindent{4.1.1. \it Imperfections}
\smallskip

Selective annealing cannot cool perfectly, because the ensemble is only finite.  
For a start, the member objects must lose some independence through the duplications, 
so that after two or more cooling steps the ensemble is no longer quite at equilibrium.
So annealing must still be accompanied by equilibration over $\theta$: there is never any escape from that.

If $\delta\lambda$ were to be set large, the ensemble still could not anneal further than to the largest likelihood in its current membership,
and this limits the amount of annealing that {\it appears\/} to take place.
Also, the estimate of $\overline L$ that underlies the weights fluctuates statistically by ${\cal N}^{-1/2}$, 
and this induces an ${\cal O}({\cal N}^{-1})$ systematic reduction in apparent cooling.
Just as in elementary statistics when we estimate variance as $\sum_1^{\cal N}(x - \overline x)^2 / ({\cal N} - 1)$ instead of na{\"\i}vely dividing by $\cal N$, 
we have the fixup
$$
 \delta\lambda^{\rm (apparent)} = {{\cal N} - 1 \over {\cal N}}\ \delta\lambda^{({\rm from}\ w)}.
$$
Presumably it is slightly better to use this apparent cooling in the actual updating of $\lambda$, 
though the effect is invisible if the exploration of $\theta$ is reasonably efficient after each update.

The evidence value $E$ should be accumulated as 
$$
 \log E = \sum_{\lambda=0}^{\lambda=1} \langle \log L \rangle_\lambda\,\delta\lambda\,.
$$
As it stands, though, this expression is noisy and biassed.
It is noisy because each $\langle \log L \rangle_\lambda$ is the merely-numerical average of $\cal N$  hopefully-random samples at coolness $\lambda$.
More subtly, it is biassed upwards because random upward excursions in $\log L$ are amplified non-linearly in the weights $w = L^{\delta\lambda}$ 
which underlie the selection of $\delta\lambda$.
The effect is to increase $\delta\lambda$ so that the contribution to $\log E$ is over-weighted relative to the opposing downward excursions.
To reduce these effects, we can impose the analytical constraint that $\langle \log L \rangle_\lambda$ must be a non-decreasing function of $\lambda$,
the reason being
$$
 d\langle \log L \rangle_\lambda\,/d\lambda = {\rm variance}(\log L)_\lambda \ge 0.
$$
Arithmetically, we keep the historical record of $\{\lambda,\langle \log L \rangle_\lambda\}$ pairs.
But, if $\langle \log L \rangle_\lambda$ tries to decrease, we spread the deficit back along the record until the constraint is satisfied.
Instead of being a noisy curve with systematic upward trend, the record is now a rising staircase.
This fixup suppresses most of the noise, and reduces the bias indefinitely when cooling is sufficiently slow.
Estimates of $E$ and $H$ are thereby improved.

\bigskip
\noindent{4.1.2. \it Properties}
\smallskip

The most important feature of selective annealing is that objects in the ensemble jump from ``bad'' low-likelihood positions 
to such ``good'' high-likelihood places as have currently been found.
Hence objects can escape from local maxima of likelihood without having to find an exit geometrically, or by tunnelling.
In a multimodal problem having several likelihood peaks, only one object needs to find the ``right'' mode for all the others to copy across in due course.
And, because of the regular nature of copying in this implementation, no ``good'' object $(L \ge \overline L)$ is ever destroyed.

Qualitatively, the cooling speed $\delta\lambda$ per step varies inversely with the log-likelihood range.  
The general formula is non-linear, but the small-speed limit is
$$
    \delta\lambda = {\tt Rate} \,/\, (\log L_{\rm max} - \log \overline L).
$$
Thus, whenever the likelihoods are widely separated because of a phase change or other difficulty in the application, 
the cooling rate automatically slows in proportion to compensate.  
Cooling also slows if one object suddenly acquires a much higher likelihood than the others.
Again, the program ``recognises'' that something interesting has happened, and takes care to slow down.

In the small-speed limit, the relative information between the beginning and end of a step depends on the standard deviation of the log-likelihood as
$$
    \delta H \simeq \big((\log L)_{\rm r.m.s.}\, \delta\lambda \big)^2 \sim {\tt Rate}^2.
$$
Thus selective annealing is broadly in line with the hope that this relative information should be held more-or-less constant and ${\cal O}(1)$.
However, I prefer to fix the cooling by selective annealing because $(a)$ it has a direct computational interpretation in terms of copy operations, 
$(b)$ it takes note of the single most probable object, which seems sensible when that one happens to be unusual, and
$(c)$ the steps can be arbitrarily small; even a step where the ensemble chanced not to change at all has nevertheless annealed by the imposed $\delta\lambda$.

Goggans and Chi (2004) give an account of the annealing methods outlined here.

\bigskip
\noindent{4.2. COMPARISON WITH STATISTICAL THERMODYNAMICS}
\smallskip

Statistical thermodynamics is the application of probabilistic methods to large physical systems,
and some of its formulas correspond closely with ours.
Our basic Bayesian formulas are:

{\openup1\jot
\halign{\indent  # \hfil&  $\quad\colon\quad # $                                                                      \hfil\cr
  Prior                 & \hbox{$\pi = \pi(\theta)$ where symbol $\theta$ includes number of atoms $N$ and all attributes} \cr
  Likelihood            & \hbox{$L = L(\theta)$ being explored at coolness $\lambda$}                                      \cr
  Evidence              & E = E(\lambda) = \smallint \pi L^\lambda d\theta                                                 \cr
  Annealed posterior    & P = P_\lambda(\theta) = \pi\, L^\lambda / E                                                      \cr
  Information=$-$Entropy& H = H(\lambda) = -\log E + \lambda\,d\log E / d\lambda = +\smallint P \log P \,d\theta           \cr
                        & \langle \log L \rangle = d\,\log E / d\lambda = \smallint P\log L \,d\theta                      \cr 
                        & d\log E = \langle \log L \rangle d\lambda                                                        \cr
                        & H = -\log E + \lambda \langle \log L \rangle                                                     \cr
       }
}\vskip 2pt
\noindent
These are strikingly similar to formulas for a canonical (fixed $N$) ensemble of $\cal V$-state systems.

{\openup1\jot
\halign{\indent  #       \hfil&  $\quad\colon\quad # $                                     \hfil\cr
  List of states              & j = 1,2,\ldots,{\cal V}                                         \cr
  Energy states               & \hbox{${\cal E}_j$ being occupied at temperature $T$}           \cr
  Canonical partition function& Q = Q(T) = \sum \exp(-{\cal E}_j/T)                             \cr
  Probability of state        & P = P(j \mid T) = \exp(-{\cal E}_j/T)\,/\,Q                     \cr
  Entropy                     & S = S(T) = \log Q + T\, d\log Q/dT = - \sum P(j)\log P(j)       \cr
  Energy                      & \langle{\cal E}\rangle = T^2\,d\log Q/dT = \sum P(j) {\cal E}_j \cr 
                              & d\log Q = \langle{\cal E}\rangle\,dT/T^2                        \cr
                              & S = \log Q + \langle{\cal E}\rangle / T                         \cr
       }
}\vskip 2pt

\noindent
This invites the following identifications:

{\openup1\jot
\halign{\hskip 150pt $\hfil#\hfil\quad $&$     #             $&$ \quad\hfil#\hfil       $\cr
                    {\rm Thermodynamic} &                     & {\rm Bayesian}           \cr
                              j         & \longleftrightarrow & \theta                   \cr
                       {\cal E}_j       & \longleftrightarrow & -\log L(\theta)          \cr
                \langle{\cal E}\rangle  & \longleftrightarrow & -\,\langle\log L \rangle \cr
                        Q\,/\,{\cal V}  & \longleftrightarrow &  E                       \cr
                         1\,/\,T        & \longleftrightarrow & \lambda                  \cr
                      S - \log{\cal V}  & \longleftrightarrow & - H                      \cr
        }
}\vskip 2pt
Statistical thermodynamics being a well-developed discipline, it may contain other techniques of use to us.
In particular, there is a grand canonical ensemble which explicitly allows $N$ to vary according to a chemical potential $\mu$.
Actually, the list of states in the canonical ensemble could already include differing numbers $N$ of component parts, 
so from our point of view the grand canonical ensemble may offer nothing new.
Let's see.
The thermodynamic formulas for the grand canonical ensemble are:

{\openup2\jot
\halign{ \quad   #             \hfil&  $\quad\colon\quad # $                                                                                             \hfil\cr
  List of states                    & \hbox{$j = 1,2,\ldots$ for each number of atoms $N$}                                                                    \cr
  Energy states                     & \hbox{${\cal E}_j(N)$ with temperature $T$}                                                                             \cr
  Grand canonical partition function& \Xi = \Xi(T,\mu) = \sum_{N,j} \exp(-({\cal E}_j(N) - N\mu)/T)                                                           \cr
  Probability of state              & P = P(N,j \mid T,\mu) = \exp(-({\cal E}_j(N) - N\mu)/T)\,/\,\Xi                                                         \cr
  Entropy                           & S = S(T,\mu) = \log\Xi + T\,\partial\log\Xi/\partial T = - \sum P(N,j)\log P(N,j)                                       \cr
  Number                            & \langle N \rangle = T\,\partial\log\Xi/\partial\mu = \sum P(N,j) N                                                      \cr 
  Energy                            & \langle{\cal E}\rangle = T(\mu\,\partial\log\Xi/\partial\mu + T\,\partial\log\Xi/\partial T) = \sum P(N,j){\cal E}_j(N) \cr 
                                    & S = \log\Xi + (\langle{\cal E}\rangle - \mu \langle N \rangle)/T                                                        \cr
       }
}\vskip 4pt

The two ``potentials'' $T$ and $\mu$ allow a variety of ``annealing'' paths through the two-dimensional $(T,\mu)$ space,
and it looks as if this could enable more direct control over the number of atoms.
The geometric prior $\Pr(N) = \pi(N) = c^N$ in particular looks attractively similar to the chemical potential factor $e^{N \mu / T}$.
However, in a Bayesian calculation powering the prior $\pi$ is quite different from powering the likelihood $L$.
For a start, it interferes with normalisation, which matters for a prior but not for a likelihood.
At a more formal level, the prior represents our prior assumptions and is a {\bf measure}.
It is an intrinsically additive quantity, which can only appear linearly, as in $\int \pi(\theta)\ldots d\theta = \int\ldots d\pi$.
If we were, regardless, to raise $\pi$ to (say) the zeroth power, we would in effect be assigning uniform weight with respect to the coordinates.
But coordinates are arbitrary, raising a difficulty about consistent treatment.
Indeed, in the case of a number of atoms, we certainly do {\bf not} want to assign uniform weight over a possibly unlimited number.
In our Bayesian context, different numbers of atoms are {\bf not} {\it a-priori-}equivalent.
So we do {\bf not} want the powering coefficient $\mu/T$ to alter.
That sits uneasily with wanting $T$ to anneal downwards from $\infty$, and suggests annealing with $\mu/T$ held constant.
But that removes the whole point of considering the grand canonical ensemble in the first place,
because we already have $\mu/T = \log c = {\rm constant}$ as the geometric prior we started with.
Once more, we find the more general approach being unhelpful.

Out of interest, I have tried ``annealing'' $\mu$ instead of $\lambda$, starting with $\mu = -\infty$ at which no atoms are present,
and gradually allowing more atoms to appear.
The method was hopelessly inefficient, because the full force of the likelihood was seen by each fresh atom,
and this was wholly unsympathetic to easy movement.
Finally, it would be permissible to factorise the likelihood into two or more factors $L = L_1L_2\cdots$, each with its own coolness $\lambda_1,\lambda_2,\cdots$,
and to take an arbitrary path from coolnesses all 0 to all 1.
I see no advantage.
\vfill\eject

\noindent{$\underline{\hbox{\bf{Section 5. The BayeSys program}}}$}
\bigskip

BayeSys (an acronym for {\bf Baye}sian {\bf Sys}tem) is a program for sampling the posterior of a system having an atomic prior,
along with calculating the evidence value.
The program incorporates several engines for creating, destroying, and moving the atoms efficiently.
It operates on an ensemble of objects, and allows them to communicate so that they can catalyse each other's progress.
The Massive Inference extension can be used if some of an atom's coordinates represent additive flux values.
As BayeSys attempts to modify the atoms, it passes each suggested modification to your user-procedures, which inform it about the associated likelihood change.
In the light of this, it accepts, rejects, or changes its suggestion.
After each complete iterate, when each atom has likely been changed, BayeSys passes all its atoms back to your procedure {\tt UserMonitor},
for display of the current ensemble of sample objects $\{\theta\}$, and collection of any statistics you want.
At this point, you may want to re-calibrate any ``nuisance'' parameters $\phi$ that had to be assigned.
More properly, you should re-sample them from their posterior probability as calculated in accordance with the current object they are related to.
This alternate sampling of relevant parameters (here $\theta$ and $\phi$) is known as ``Gibbs sampling'' (Geman \& Geman 1984). 
It allows the joint posterior of atoms and nuisance parameters to be explored faithfully, 
which is how nuisance parameters are correctly estimated and eliminated.
\bigskip
$$
\matrix{
     & &
       {\vbox{\offinterlineskip
             \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
             height2pt         &        \omit               &     \cr
                               &         Start              &     \cr
                               &Initialise parameters $\phi$&     \cr
             height2pt         &        \omit               &     \cr
                     \noalign{\hrule}
                    }
             \vskip 6pt
             }
       }
     & &
        \cr
     & &
       \Biggl\downarrow & & \cr
       {\vbox{\offinterlineskip
            \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
            height2pt         &          \omit              &         \cr
                              &     Response to new         &         \cr
                              &     positions or            &         \cr
                              &     numbers of atoms        &         \cr
            height2pt         &          \omit              &         \cr
                     \noalign{\hrule}
                        \omit  &  {\tt User procedures}     &  \omit  \cr
                   }
          }
       }
       &
       {\vbox{\offinterlineskip
            \halign{ \strut\ \hfil#\hfil\  \cr
                     \leftarrowfill        \cr
                     \phantom{xxxxxx}      \cr
                     \rightarrowfill       \cr
                   }
             \vskip 8pt
             }
       }
       &
       {\vbox{\offinterlineskip
             \halign{  \vrule width2pt# & \strut\ \ \hfil#\hfil\ & \vrule width2pt# \cr
                     \noalign{\hrule height2pt}
             height6pt         &          \omit            &         \cr
                               &     Atoms of new object   &         \cr
                               & $\theta$, given $\phi$    &         \cr
             height6pt         &          \omit            &         \cr
                     \noalign{\hrule height2pt}
                        \omit  &  {\tt BayeSys/MassInf}    &  \omit  \cr
                   }
             }
       }
       &
       {\vbox{\offinterlineskip
            \halign{ \strut\ \hfil#\hfil\  \cr
                     \rightarrowfill       \cr
                     \phantom{xxxxxx}      \cr
                     \leftarrowfill        \cr
                   }
             \vskip 8pt
             }
       }
       &
       {\vbox{\offinterlineskip
            \halign{  \vrule# & \strut\ \ \hfil#\hfil\  & \vrule# \cr
                     \noalign{\hrule}
            height2pt         &       \omit             &         \cr
                              &  Display object $\theta$&         \cr
                              &  Collect statistics     &         \cr
                              & (New parameters $\phi$) &         \cr
            height2pt         &       \omit             &         \cr
                     \noalign{\hrule}
                        \omit  &  {\tt UserMonitor}     &  \omit  \cr
                   }
          }
       }
         \cr
     & & \Biggl\downarrow & & \cr
     & &
       {\vbox{\offinterlineskip
             \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
             height3pt         &        \omit           &         \cr
                               &        Finish          &         \cr
             height2pt         &        \omit           &         \cr
                     \noalign{\hrule}
                    }
             \vskip 6pt
             }
       }
     & &
        \cr
     }
$$
\bigskip
It is your responsibility to demand that the computation finish --- by signalling from {\tt UserMonitor}.

\vfill\eject
\noindent{5.1. THE BAYESYS PRIOR}

\bigskip
\noindent{5.1.1. \it Number of atoms}
\smallskip

As programmed in BayeSys, the prior $\Pr(n)$ for the number of atoms can be styled ``uniform'', ``Poisson'' or ``geometric'',
according to the sign ($0, +, -$) of the input parameter ${\tt Alpha} = \alpha$.
In each case, the distribution is qualified by a minimum number $M$ and a maximum number $N$ supplied through parameters {\tt MinAtoms} and {\tt MaxAtoms},
$$
    M \le n \le N.
$$

$M$ is given directly by {\tt MinAtoms}.
It must be positive ($1,2,\ldots$) and not 0 (or negative).
This means that BayeSys excludes the empty object with no atoms --- a restriction imposed for the convenience of both author and user.
If you the user really want to include the empty object, you can treat it as an alternative prior hypothesis, weighted as always through Bayes' theorem with
$$
\eqalign{
  \Pr(D\mid n>0) \,&=\, \hbox{Evidence from BayeSys (given as its logarithm);}  \cr
  \Pr(D\mid n=0) \,&=\, \hbox{Likelihood}(\hbox{Data}\mid\hbox{empty object}).  \cr
        }
$$

$N$ is given by {\tt MaxAtoms}, usually directly, except that ${\tt MaxAtoms} = 0$ codes for $N = \infty$ (absence of a maximum).
Omitting the maximum is prohibited for a uniform prior, which would become improper.
Obviously, $N$ must be at least as large as $M$.
Equality is allowed, and forces BayeSys to use exactly that number of atoms.

The ``uniform'' prior (${\tt Alpha} = 0$) is
$$
  \Pr(n\mid\alpha\!=\!0) \,=\, (N-M+1)^{-1}\qquad \hbox{ for $M \le n \le N$}
$$
with mean and variance
$$
  \langle n\rangle = \hbox{$1 \over 2$}(N + M), \quad\quad  {\rm var}(n) = \hbox{$1 \over 12$}(N-M)(N-M+2).
$$

The ``Poisson'' or binomial prior is specified by ${\tt Alpha} = \alpha > 0$.
With finite maximum, the implementation is binomial, offset by the minimum number.
$$
  \Pr(n\mid\alpha\!>\!0) \,=\, { (N-M)! \over (n-M)!\,(N-n)! }\, q^{n-M} (1-q)^{N-n}\ ,  \qquad q = {\alpha \over \alpha + N - M}\,.
$$
Supplying the binomial rate through ${\tt Alpha} = \alpha$ rather than directly as $q$ makes it impossible to supply an unusable value $q>1$.
The binomial mean and variance are
$$
  \langle n\rangle = (1-q)M+qN, \quad\quad  {\rm var}(n) = (N-M)\,q(1-q).
$$
If the maximum is omitted ($N = \infty$), the binomial reduces to Poisson, offset by the minimum number.
$$
  \Pr(n\mid\alpha\!>\!0) = e^{-\alpha} \alpha^{(n-M)} / (n-M)!\,, \quad\quad   \langle n\rangle = M+\alpha, \qquad {\rm var}(n) = \alpha
$$

The ``geometric'' prior is specified by ${\tt Alpha} = \alpha < 0$, exhausting all the setting possibilities.
It is
$$
  \Pr(n\mid\alpha\!<\!0) = {1-c \over 1-c^{N-M+1}}\,c^{n-M},   \qquad c = {|\alpha| \over |\alpha| + 1}.
$$
Again, supplying the ratio through {\tt Alpha} in this way makes it impossible to supply a ratio $c>1$  which could be unusable.
The limit $\alpha \to -\infty$ is the uniform case alternatively coded as ${\tt Alpha} = 0$.
If the maximum is omitted, the geometric prior is no longer truncated and is
$$
\eqalign{
  \Pr(n\mid\alpha\!<\!0) &= (1-c)\,c^{n-M}, \qquad c = {|\alpha| \over |\alpha| + 1},           \cr
  \langle n\rangle &= M + |\alpha|, \qquad {\rm var}(n) = |\alpha|\,\big(|\alpha| + 1\big). \cr
        }
$$
Thus, as for the Poisson prior, $\alpha$ directly yields the suggested number of atoms.

Your selection of prior should be influenced by the degree of complexity $\langle n\rangle$ that you expect in the object, 
and by the uncertainty $\surd{\rm var}(n)$ of that estimate.
For many applications, I prefer to keep options open by omitting the maximum number of atoms and setting the minimum to 1.
Then {\tt Alpha} can be set to the number of atoms I expect to see, 
usually made negative because the geometric distribution is less committal than the Poisson ($n \pm n$ instead of $n \pm \surd n$).
However, the precise settings ought not to influence the results very much.
If they do, seek a reason because something may be amiss.

\bigskip
\noindent{5.1.2. \it Coordinates}
\smallskip

The last parameter needed to define the prior is {\tt Ndim}, the number of attributes per atom, otherwise known as the dimensionality~$d$. 
This has to be a positive integer.
Each attribute or coordinate is restricted to the range $[\,0,1]$, and the prior is uniform over the unit hypercube $[\,0,1]^d$.
The Hilbert-curve transformations of the hypercube are handled within BayeSys, and need not concern the applications programmer.
BayeSys will provide you with suggested positions for its atoms, 
and in fact each coordinate will always be an odd multiple of $2^{-33}$ (for a 32-bit word length).
You do not have to provide any positions yourself, but you can instruct your user-procedures to reject particular positions.
If you do this, BayeSys will omit those parts of the hypercube from your prior.

However, the BayeSys package includes a ``{\tt BayeShape}'' facility that will transform any or all of its {\tt Cube} hypercube coordinates to different {\tt Coord} shapes for you.
For $m$ dimensions ($\le d$), these shapes are currently:

\centerline{\vbox{\vskip 4pt
\offinterlineskip
      \halign{&\vrule#&\strut\quad\hfil#\hfil\quad&\vrule#&\strut\quad#\quad\hfil&\vrule#& \strut\quad#\hfil\quad          &\vrule# & \strut\quad#\hfil\quad &\vrule#\cr
   \noalign{\hrule}                                                                                                                                                  \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &      {\tt Shape}          &      & \ \ Description      &      &\qquad Prob({\tt Coord[i]})          &       & \ \  Range of $i$      &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
   \noalign{\hrule}                                                                                                                                                  \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            0              &      &   Permutation        &      & uniform on integers Perm$(0,\ldots,m-1)$&   & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            1              &      &   Positive orthant   &      & $\exp(-x)$ in $x>0$                 &       & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            2              &      &   Simplex volume     &      & uniform in $\sum x < 1$             &       & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            3              &      &   Simplex surface    &      & uniform on $\sum x = 1$             &       & $0,\ldots,m$           &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            4              &      &   Ordered            &      & uniform in $0<x_0<x_1<\ldots<1$     &       & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            5              &      &   Bell               &      & Normal(0,1) in $-\infty<x<\infty$   &       & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            6              &      &   Sphere volume      &      & uniform in $\sum x^2 < 1$           &       & $0,\ldots,m-1$         &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            7              &      &   Hemisphere surface &      &uniform on $\sum x^2 = 1,\quad x_m>0$&       & $0,\ldots,m$           &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
                      &            8              &      &   Sphere surface     &      & uniform on $\sum x^2 = 1$           &       & $0,\ldots,m$           &      \cr
            height2pt &           \omit           &      &         \omit        &      &         \omit                       &       &         \omit          &      \cr
   \noalign{\hrule}                                                                                                                                                  \cr
          }
       } }
\noindent The eight geometrical transformations $1,2,\ldots,8$ are each 1:1, but note that the three ``surface'' shapes (3,7,8) are embedded in one dimension higher, 
for which you need to supply extra length in your destination array.
The source hypercube has $2^m$ corners, at which the anisotropy of the transformation may become singular.  
The {\tt BayeShape} procedures keep such singularities separated and thereby diluted, to avoid un-necessarily gross local distortions.

\bigskip
{\tt Shape}=0:
 
\noindent  The output permutation in {\tt Coord} is simply the ranking order of the input {\tt Cube} coordinates.

\bigskip
{\tt Shape}=1:
 
\noindent  Each coordinate $x$ is transformed to $u = -\log(1-x)$, distributed as $\Pr(u)=e^{-u}$.
The origin remains unchanged and other corners of the hypercube recede harmlessly to infinity.

\bigskip
{\tt Shape}=2:

\noindent The output $u_0,\ldots,u_{m-1}$ from the previous transformation is distributed uniformly over each hyperplane $\sum u = r = {\rm constant}$,
with inward cumulant
$$
    \int_{0 < \Sigma u < r} \Pr(u)\,du = \int_0^r dx\,e^{-x}\,x^{m-1}/(m-1)! = P(m,r)\,,
$$
which is the incomplete gamma function $P$.
The desired output $v$ within the unit simplex should also be distributed uniformly over each hyperplane $\sum v = a = {\rm constant}$,
but with inward cumulant
$$
    \int_{0 < \Sigma v < a} \Pr(v)\,dv = a^m\,.
$$
To transform $u$ to $v$, we equate the cumulants, by setting
$$
    a = P(m,r)^{1/m},\qquad v_i=(a/r)u_i\,.
$$
This makes $v_0,\ldots,v_{m-1}$ uniform over the interior of the unit simplex.
The origin of the source hypercube remains unchanged, its $m$ adjacent corners go to the other vertices of the simplex,
and the remaining $2^m - m - 1$ corners are distributed over the simplex roof $\sum v = 1$.

\bigskip
{\tt Shape}=3:

\noindent The $m$ numbers $v_0,\ldots,v_{m-1}$ from the previous transformation have $\Pr(v) = m! = {\rm constant}$ over $\sum v < 1$.
Define
$$
    v_m = 1 - \sum_{i=0}^{m-1} v_i
$$
so that
$$
    \Pr(v_m \mid v_0,\ldots,v_{m-1}) = \delta\Big(v_m - (1 - \sum_{i=0}^{m-1} v_i)\Big)\,.
$$
Hence
$$
    \Pr(v_0,\ldots,v_m) = \Pr(v_0,\ldots,v_{m-1})\,\Pr(v_m \mid v_0,\ldots,v_{m-1}) = m!\,\delta\Big(1 - \sum_{i=0}^m v_i\Big)\,.
$$
These are $m+1$ numbers uniformly distributed over the simplex surface $\sum v = 1$, as required.
The origin of the source hypercube maps to $(0,0,0,\ldots,0,1)$, 
and the $m$ adjacent corners map to the other vertices $(1,0,0,\ldots,0,0),(0,1,0,\ldots,0,0),\cdots,(0,0,0,\ldots,1,0)$.
The remaining $2^m - m - 1$ corners lie elsewhere on the face $v_m = 0$.
Although sampling is correctly uniform, and in 1:1 correspondence with the source hypercube, 
the local shape of the transformation does not treat the $m+1$ destination axes symmetrically.
It is not possible to transform a cube to a simplex in a way that has the full symmetry of the simplex.

\bigskip
{\tt Shape}=4:
 
\noindent Starting again with the $m$ numbers $v_0,\ldots,v_{m-1}$, define
$$
    w_j = \sum_{i=0}^j v_i
$$
so that $0 < w_0 < w_1 < w_2 < \cdots < w_{m-1} < 1$.
The Jacobian $\partial(w)/\partial(v)$ is 1, so
$$
    \Pr(w) = \Pr(v)\,\partial(v)/\partial(w) = m! = {\rm constant}.
$$
Hence the $w$ are $m$ numbers, ordered but otherwise uniformly distributed in (0,1).
The mapping is 1:1 from the source hypercube, which encourages efficient exploration.
It is {\bf not} the $m!:1$ mapping that would result from merely sorting $m$ independent numbers.
The origin of the source hypercube maps to $(0,0,0,\ldots,0,0)$, 
and the $m$ adjacent corners map to the other vertices $(0,0,0,\ldots,0,1),(0,0,0,\ldots,1,1),\cdots,(1,1,1,\ldots,1,1)$ of the skew simplex containing $w$.
The remaining $2^m - m - 1$ corners lie elsewhere on the face $w_{m-1} = 1$.

\bigskip
{\tt Shape}=5: 

\noindent Each coordinate $x$ is transformed by the inverse normal cumulant ${\cal N}^{-1}$ to
$$
   u = {\cal N}^{-1}(x),\qquad {\cal N}(u) \equiv (2\pi)^{-1/2} \int_{-\infty}^u e^{-t^2/2}dt = x\,.
$$
The points $u$ cover all space, and
$$
   \Pr(u) = \Pr(x)\,{\partial(x)\over\partial(u)} = \prod_{i=0}^{m-1} (2\pi)^{-1/2} \exp(-u_i^2/2) = (2\pi)^{-m/2} e^{-r^2/2}
$$
where $r^2 = \sum u_i^2$.
Hence the points $u$ have the required unit Gaussian distribution.
The cube centre $({1\over 2}, {1\over 2}, \ldots, {1\over 2})$ maps to the origin $u=0$, and the corners recede harmlessly to infinity along their respective directions.

\bigskip
{\tt Shape}=6: 

\noindent The output $u_0,\ldots,u_{m-1}$ from the previous transformation is distributed uniformly over each sphere $\sum u^2 = r^2 = {\rm constant}$,
with inward cumulant
$$
    \int_{0 < \Sigma u^2 < r^2} \Pr(u)\,du = P_{m/2}(r^2/2)\,,
$$
which is another incomplete gamma function $P$.
The desired output $v$ within the unit sphere is also to be distributed uniformly over each sphere $\sum v^2 = a^2 = {\rm constant}$,
but with inward cumulant
$$
    \int_{0 < \Sigma v^2 < a^2} \Pr(v)\,dv = a^m\,.
$$
To transform $u$ to $v$, we equate the cumulants, by setting
$$
    a = \big(P_{m/2}(r^2/2)\big)^{1/m},\qquad v_i = (a/r)u_i\,.
$$
This makes $v_0,\ldots,v_{m-1}$ uniform over the interior of the unit sphere.
The corners of the source hypercube are distributed over the sphere surface, with their original cubic symmetry.
This is much less distorting than attempting to use polar coordinates, which are badly anisotropic near the origin.

\bigskip
{\tt Shape}=7: 

\noindent The Gaussian cumulant $P_{m/2}(r^2/2)$ can alternatively be used to map radius $r$ to $\psi$, the polar angle on a hemisphere surface, 
which has inward (towards the pole) cumulant
$$
 {2 \over \surd\pi} {\Gamma({1\over2}(m+1)) \over \Gamma({1\over2}m)} \int_0^\psi \sin^{m-1}\xi\,d\xi\,.
$$
Equating the two cumulants determines $z_m = \cos\psi$, and also the norm $a = \sqrt{1 - z_m^2} = \sin\psi$ of the remaining coordinates,
from which 
$$
   z_i = (a/r)u_i, \qquad i=0,\ldots,m-1.
$$
The $m+1$ numbers $z_0,\ldots,z_m$ are uniformly distributed over the upper ($z_m\ge 0$) unit hemisphere.
All the corners of the source hypercube are on the equator $z_m=0$, distributed with their original cubic symmetry.

\bigskip
{\tt Shape}=8: 

\noindent The surface of a complete sphere poses a difficulty in that there is no natural edge where the hypercube corners could be sent.
If all the corners are sent to a single location at the lower ``south'' pole, the transformation there becomes extremely anisotropic.
Instead, I keep the hemisphere mapping as before, 
but select upper or lower hemisphere according to the parity (0 or 1) of the Hilbert integer encoding the source coordinates.
The advantage of this is that the hypercube corners remain well spread out round the equator $z_m=0$.
The disadvantage is that exploration will be slower: because the upper and lower hemispheres are point-to-point adjacent,
about half the trial points will be in the wrong hemisphere.
There would also be trouble if an over-intelligent user tried to hide some information of his/her own in this parity bit of lowest arithmetical significance,
because that information would be misinterpreted without warning.

\vfill\eject
\noindent{5.2. THE BAYESYS ENGINES}
\smallskip

BayeSys has several MCMC exploration algorithms or ``engines'', currently LifeStory1, LifeStory2, \hbox{GuidedWalk}, Leapfrog1, Leapfrog2, Chameleon1 and Chameleon2.
The LifeStory engines control the birth and death of atoms, and allow their movement along the Hilbert curve.
The GuidedWalk and Leapfrog engines allow atoms to move geometrically within the coordinate hypercube, 
thus letting the ensemble learn about and use the shape of the likelihood function.
The Chameleon engines let atoms jump between the objects in their ensemble, thus allowing the objects to communicate more directly.
The description here follows Skilling (2004b).

You can control the engines by setting the 7-bit-integer input parameter {\tt Method} as follows.
$$
\hbox
{\vbox{\hbox{\tt Method = }\vskip 20pt}}
{\vbox{
 \offinterlineskip
 \halign{& \vrule# &\ \hfil#\hfil\ &\ \ \hfil#\hfil\ \ &\ \hfil#\hfil\ &\ \hfil#\hfil\ &\ \hfil#\hfil\ &\ \hfil#\hfil\ & \ \hfil#\hfil\ &\vrule# &\ #\hfil \strut\cr
                \omit   &  high    &         &          &          &          &          &   low   &\omit&        \cr
                                                  \multispan8\hrulefill                                           \cr
              height2pt & \omit    & \omit   & \omit    & \omit    & \omit    & \omit    & \omit   &     & \omit  \cr
                        &GuidedWalk&Leapfrog2& Leapfrog1&Chameleon2&Chameleon1&LifeStory2&Hilbert  &     &  1 ON  \cr
              height2pt & \omit    & \omit   & \omit    & \omit    & \omit    & \omit    & \omit   &     & \omit  \cr
                        &  off     &  off    &  off     &  off     &  off     &LifeStory1&raster   &     &  0 OFF \cr
              height2pt & \omit    & \omit   & \omit    & \omit    & \omit    & \omit    & \omit   &     & \omit  \cr
                                                  \multispan8\hrulefill                                           \cr
                 \omit  &  64      &  32     &  16      &   8      &   4      &   2      &   1     &\omit&        \cr
        }
      }
}
$$
The lowest bit of {\tt Method} switches between topologies.
If the bit is ON the Hilbert curve is used, otherwise a simple raster is used (with attribute coordinates $0,1,2,\ldots$ taking progressively lower precedence).
In one dimension, the two topologies are equivalent, so the switch has no effect.
Generally, I recommend using the Hilbert topology unless you have a specific contrary need.

The next-to-lowest bit of {\tt Method} switches between LifeStory1 and its more sophisticated form LifeStory2.
These two are the only engines in BayeSys that are guaranteed to mix all the states irreducibly, so one of them must be present, 
and you are given no way of switching them both off.
Generally, I recommend the extra power of LifeStory2, although its iterations are more expensive and for a few applications it is inapplicable.

The highest five bits of {\tt Method} switch the other individual engines on or off.
For example, {\tt Method}=27 (= 16+8+2+1) uses Leapfrog1 and Chameleon2, with LifeStory2 but not LifeStory1, in the context of Hilbert topology.
Future development may yield new engines switched by higher bits.

None of the {\tt Method} settings should change the ultimate statistical properties of the results, because BayeSys always aims to explore the posterior.
It is only the efficiency of that exploration which changes.
I suggest the strategy of developing an application with all bits ON ({\it i.e.} ${\tt Method} = 127$ or equivalently ${\tt Method} = -1$).
Then, if you need to save computer resources, try switching bits OFF as long as your results are undamaged.
Remember that you can control the speed of annealing with parameter {\tt Rate}.

All the engines operate in the environment of an ensemble of $\cal N$ objects (input as parameter {\tt ENSEMBLE}),
each being a sample from the posterior distribution, as currently annealed.
The LifeStory engines operate on just one object at a time, but the others can use two or even three.
To control this, we consider the ensemble to be a single super\-system
$\Theta = \{ \theta_1, \theta_2, \ldots, \theta_{\cal N} \}$
with its ensemble prior
$$
  \pi(\Theta) = \pi(\theta_1) \pi(\theta_2) \cdots \pi(\theta_{\cal N})
$$
and its ensemble likelihood
$$
  {\cal L}(\Theta) = L(\theta_1) L(\theta_2) \cdots L(\theta_{\cal N})
$$
giving its ensemble posterior
$$
  {\cal P}(\Theta) = P(\theta_1) P(\theta_2) \cdots P(\theta_{\cal N})
$$
which factorises cleanly into the required posteriors of the constituent objects.
Equilibrating just one object at a time is like exploring $\Theta$ by simple Gibbs sampling, but the supersystem allows more mixed exploration.
Dimensionality need not be a curse; here it is an opportunity.

There is a reason for having several engines.
Any individual engine can be thwarted by a particular form of likelihood.
Even if provably convergent, it may be impractically slow.
A different engine might overcome the defect, only to be thwarted elsewhere.
The combination of all engines will only be defeated by likelihoods that defeat every one.
BayeSys currently runs with up to six engines, and roughly balances its computation time between them.
At worst, the cost of running six engines instead of the best alone is only a factor of six.
The payoff comes when one of the engines rescues the system from a location that would trap the others.

As a technicality, balancing run-times on the basis of results achieved amounts to a derogation from strict detailed balance.
Accordingly, the balancing is only tuned during annealing, during which stage the ensemble is bound to be slightly away from equilibrium anyway.
During exploration at fixed (usually unit) coolness, the engine ratios are frozen.

We give no general guarantee of convergence.
A sufficiently perverse likelihood will defeat any engine, even if composite, and you may never be aware of it.
``Most'' conceivable likelihoods will defeat any engine we are ever likely to build!
But the same could be said of integers, which we happily use despite ``most'' of them being unusably large.
Our {\bf actual} applications involve data that we feel will be interpretable, otherwise the data would not have been collected in the first place.
We specialise in soluble problems!

\bigskip
\noindent{5.2.1. \it LifeStory1}
\smallskip

The LifeStory1 engine operates on just one object in the ensemble,
and combines the processes of birth, death, and movement -- hence the name -- in a rather natural way.
As explained in section 3.1, the transition scheme for exploring the prior number $n$ of atoms uses a birth rate $\beta_n$ and death rate $n$ per unit artificial time.
When an event occurs, being birth or death in ratio $\beta_n : n$, the atomic number is to be incremented or decremented as appropriate, 
subject to Metropolis-Hastings acceptance to ensure detailed balance.
$$
           \raise  0pt\vbox{\offinterlineskip \hrule \halign{\hfil #\hfil\strut\cr \vrule\    $n-$1     \vrule \cr} \hrule}
\hskip10pt \raise 16pt\DeathArrow \hskip10pt 
           \raise 24pt\vbox{\offinterlineskip \hrule \halign{\hfil #\hfil\strut\cr \vrule\ \ \ $n$\ \ \ \vrule \cr} \hrule}
\hskip10pt \raise 32pt\BirthArrow \hskip10pt
           \raise 48pt\vbox{\offinterlineskip \hrule \halign{\hfil #\hfil\strut\cr \vrule\    $n+$1     \vrule \cr} \hrule}
$$
More slowly, but with equal validity, we can change the atomic number with probability 50\%, leaving the indeterminate composite to be resolved afterwards.
$$
\hbox{\lower 12pt \vbox{\hbox{\ $n$} \hbox{\ or} \hbox{$n-$1} }}
\longleftarrow
\left\{\ \lower 16pt\vbox{\offinterlineskip \hrule \halign{  \hfil # \hfil                    \strut\cr
                                                           \vrule\ \phantom{1}$n\phantom{-}$ \vrule \cr
                                   \noalign{\hrule}         and                                     \cr
                                   \noalign{\hrule}        \vrule\ $n-$1 \vrule                     \cr }
                                            \hrule
                         }\ \right\}
\hskip10pt \raise 2pt\DeathArrow \hskip10pt
\raise  8pt\vbox{\offinterlineskip \hrule \halign{   \hfil # \hfil      \strut\cr
                                                  \vrule\ \ \ $n$\ \ \ \vrule \cr }
                                   \hrule
                }
\hskip10pt \raise 14pt\BirthArrow \hskip10pt
\raise 24pt\hbox{$
\left\{\ \lower 16pt\vbox{\offinterlineskip \hrule \halign{  \hfil # \hfil                    \strut\cr
                                                           \vrule\ $n+$1 \vrule                     \cr
                                   \noalign{\hrule}         and                                     \cr
                                   \noalign{\hrule}        \vrule\ \phantom{1}$n\phantom{+}$ \vrule \cr }
                                            \hrule }
\ \right\}
\longrightarrow
\hbox{\lower 12pt \vbox{\hbox{$n+1$} \hbox{\ or} \hbox{\ $n$} }
}
$}
$$
With likelihood factors $L_j$ for $j$ atoms, we proceed to sample from a birth composite, which has likelihood $L_{\rm birth} = {1 \over 2}(L_n+L_{n+1})$,
according to the relative individual likelihoods:
$$
\eqalign{
         \Pr(n+1                       \mid \hbox{birth composite}) &= L_{n+1}           / (L_n + L_{n+1}), \cr
         \Pr(\ \phantom{+}n\phantom{1} \mid \hbox{birth composite}) &= L_{n\phantom{+1}} / (L_n + L_{n+1}). \cr
        }
$$
Likewise for a death composite of likelihood $L_{\rm death} = {1 \over 2}(L_n+L_{n-1})$:
$$
\eqalign{
         \Pr(\ \phantom{-}n\phantom{1} \mid \hbox{death composite}) &= L_{n\phantom{-1}} / (L_n + L_{n-1}), \cr
         \Pr(n-1                       \mid \hbox{death composite}) &= L_{n-1}           / (L_n + L_{n-1}). \cr
        }
$$

When atoms have attributes, birth of an atom is accompanied by selection of a random coordinate $x$,
and death is accompanied by selection of a random identifiable-by-location atom, as flagged by downward arrows in the diagram below.
$$
\hbox{$
       \hbox{\lower 12pt \vbox{\hbox{\ $n$} \hbox{\ or} \hbox{$n-$1}} }
       \longleftarrow
       \Bigg\{ \lower 12pt\vbox{ \hbox{\hskip 49pt $\scriptstyle\downarrow$}
                                 \LineThree
                                 \hbox{\hskip 37pt Death}
                                 \LineTwo } \Bigg\}
     $}
\hskip10pt \raise 2pt\DeathArrow \hskip10pt
\raise 12pt\LineThree
\hskip10pt \raise 14pt\BirthArrow \hskip10pt
\raise 24pt\hbox{$
                  \Bigg\{ \lower 12pt\vbox{ \hbox{\hskip 24pt $\scriptstyle\downarrow$}
                                            \LineFour
                                            \hbox{\hskip 15pt Birth}
                                            \LineThree } \Bigg\}
                  \longrightarrow
                  \hbox{\lower 12pt \vbox{\hbox{$n$+1} \hbox{\ or} \hbox{\ $n$}} }
                $}
$$
The likelihoods are promoted to functions of the position of the atom being born or killed, in the presence of the other atoms,
and the reason for introducing intermediate composite states now appears.
The new position, whether that selected for birth or that of the atom under sentence of death,
need not be immediately accepted or rejected by Metropolis-Hastings on the basis of that position only.
Instead, movement is almost guaranteed by using binary slice sampling to adjust the composite state's position, 
using whichever of $L_{\rm birth}$ and $L_{\rm death}$ is appropriate.
Slice sampling centres on the original selected position,
and for efficiency it is carried out between the left and right neighbour atoms of that initial choice.
$$
\hbox{
\lower 12pt\vbox{\hbox{\ $n$}
                 \hbox{\ or}
                 \hbox{$n-$1}}
                }
\longleftarrow
\Bigg\{
\lower 12pt\vbox{ \hbox{\hskip 2pt {$\scriptstyle\rm Left$} \hskip 29.5pt $\scriptstyle\downarrow$ \hskip 6pt {$\scriptstyle\rm Right$}}
                  \LineThree
                  \hbox{\hskip 6pt \hbox to 25pt{\leftarrowfill} Death $\rightarrow$}
                  \LineTwo
                }
\Bigg\}
\hskip10pt \raise 2pt\DeathArrow \hskip10pt
\raise 12pt\LineThree
\hskip10pt \raise 14pt\BirthArrow \hskip10pt
\raise 24pt\hbox{$
                  \Bigg\{
                  \lower 12pt\vbox{ \hbox{\hskip 2pt {$\scriptstyle\rm Left$} \hskip 5pt $\scriptstyle\downarrow$ \hskip 10pt {$\scriptstyle\rm Right$}}
                                    \LineFour
                                    \hbox{\hskip 6pt $\leftarrow\,$Birth$\,\rightarrow$}
                                    \LineThree
                                  }
                  \Bigg\}
                  \longrightarrow
                  \hbox{\lower 12pt \vbox{ \hbox{$n$+1}
                                           \hbox{\ or}
                                           \hbox{\ $n$}}
                                         }
                $}
$$

The birth or death of a complete atom might well cause significant change in the likelihood,
and it is wise to couple the opportunity with a wider view of the possibilities by allowing movement.
If a birth event succeeds in incrementing the number of atoms, all well and good.
A change has been made, and slice sampling has likely improved the suggested location.
If it fails, then there has been no change, which is regrettable.
If a death event succeeds in decrementing the number of atoms, a change has been made.
If not, then at least the selected atom will almost certainly have been moved.
So, overall, LifeStory1 should make useful changes at least half the time.

The illustration below shows the range allowed to a selected location $X$, 
constrained to move along the Hilbert curve (with origin randomised to $(5,7)$ on a wraparound $2^6 \times 2^6$ grid) between its left and right neighbours $L$ and~$R$.
With respect to the coordinates, this range is very roughly circular out to the distance of the neighbours.
$$
\hbox
{
\vbox{\lineskip = 0pt \baselineskip = 0pt
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\o\z\z\o\z\z\o\j\Z\O\Z\O\Z\O\Z\O\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\I\L\I\I\I\L\I\I\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\L\O\Z\I\L\O\Z\I\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\Z\I\L\Z\Z\I\L\O\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\I\Z\Z\O\Z\Z\O\I\Z\O\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\L\I\Z\I\L\O\L\I\I\n\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\n\O\Z\O\Z\O\I\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\o\I\I\L\I\I\L\I\Z\I\L\O\L\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\Z\I\L\O\Z\I\Z\O\L\O\Z\I\Z\O\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\L\Z\Z\I\L\O\I\L\Z\I\L\Z\I\I\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\Z\Z\O\I\L\O\Z\Z\Z\O\Z\I\n\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\L\O\L\I\Z\I\L\O\Z\I\L\O\n\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\Z\I\Z\O\I\Z\O\I\I\Z\O\I\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\L\Z\I\I\L\I\L\I\L\I\L\I\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\Z\Z\O\Z\Z\O\Z\I\Z\O\Z\O\Z\O\Z\O\L\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\L\O\L\I\Z\I\L\O\I\L\I\I\I\L\I\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\Z\I\Z\O\I\Z\O\I\L\O\Z\I\L\O\Z\I\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\L\Z\I\I\L\I\L\I\Z\I\L\Z\Z\I\L\O\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\L\O\L\I\I\L\I\I\L\I\Z\I\L\O\L\I\I\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\z\z\j\l\z\j\l\z\j\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n}
     }
\hskip -177.6pt \vbox{\hbox{$\bigcirc$ \hskip -11.5pt ${\scriptstyle R}$} \vskip116.2pt \hbox{ } }
\hskip  -86.4pt \vbox{\hbox{$\bigcirc$ \hskip -11.0pt ${\scriptstyle L}$} \vskip100.0pt \hbox{ } }
\hskip   58.9pt \vbox{\hbox{$\bigcirc$ \hskip -11.7pt ${\scriptstyle X}$} \vskip 80.2pt \hbox{ } }
\hskip  172pt
}  
$$

\vfill\eject
\noindent{5.2.2. \it LifeStory2}
\smallskip

The LifeStory2 engine, like LifeStory1, operates on just one object, and combines the processes of birth, death, and movement.
The difference is that one of the neighbouring atoms is also allowed to move.
Because they are indivisible changes, birth and death are often discriminated against by the associated changes in likelihood.
The existing $n$ atoms may have equilibrated as best they can, so that inserting or deleting a complete atom might always be unlikely if the other atoms are all fixed,
even though global re-equilibration with $n\pm 1$ atoms might make the change acceptable or even preferable.
In this situation, LifeStory1 is thwarted. 

LifeStory2 allows a neighbouring atom to move aside to make room for the insertion, or move closer to compensate the deletion,
so that the engine can jump around the barrier and equilibrate the number of atoms more effectively.
In this way, a locally dominant constraint (such as mean position or total flux) can remain satisfied even while an atom is created or destroyed.
Richardson \& Green (1997) discuss ``splitting and combination moves'' that are akin to LifeStory2, but restricted to using defined constraints.

$$
\lower 12pt\vbox{ \hbox{\ $n$}
                  \hbox{\ or}
                  \hbox{$n-$1}
                }
\longleftarrow
\Bigg\{
\lower 12pt\vbox{\hbox{\hskip 2pt {$\scriptstyle\rm Left$} $\scriptstyle\bigtriangledown$ \hskip 34.7pt $\scriptstyle\downarrow$ \hskip 6pt {$\scriptstyle\rm Right$}}
                 \LINEfour
                 \hbox{\hskip 6pt \hbox to 39pt{\leftarrowfill} Death $\rightarrow$}
                 \LINEthree
                }
\Bigg\}
\hskip 10pt \raise 2pt \DeathArrow \hskip 10pt
\raise 12pt\LINEfour
\hskip 10pt \raise 14pt\BirthArrow \hskip 10pt
\raise 24pt \hbox{$
                   \Bigg\{
                   \lower 12pt\vbox{\hbox{\hskip 2pt {$\scriptstyle\rm Left$} $\scriptstyle\bigtriangledown$
                                                                              \hskip 9.5pt $\scriptstyle\downarrow$ \hskip 10pt {$\scriptstyle\rm Right$}}
                                    \LINEfive
                                    \hbox{\hskip 6pt \hbox to 22pt{\leftarrowfill} Birth$\,\rightarrow$}
                                    \LINEfour
                                   }
                   \Bigg\}
                   \longrightarrow
                   \hbox{\lower 12pt \vbox{ \hbox{$n$+1}
                                            \hbox{\ or}
                                            \hbox{\ $n$}}
                                          }
                 $}
$$
The diagram shows the leftward neighbour ``$\bigtriangledown$'' of the selected birth or death position ``$\downarrow$'' being included in the process:
inclusion of the rightward neighbour would be equally likely.
In the upper state of either composite there are now two atoms that can be moved around, and in the lower state there is one.
Binary slice sampling can equally well be carried out on two coordinates as on one.
It centres on the original selected positions,
and for efficiency it is done between the left and right neighbour atoms of those initial choices, as shown by the horizontal range arrows.
$$
\hbox
{
\vbox{\lineskip = 0pt \baselineskip = 0pt
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\o\z\z\o\z\z\o\j\Z\O\Z\O\Z\O\Z\O\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\I\L\I\I\I\L\I\I\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\L\O\Z\I\L\O\Z\I\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\Z\I\L\Z\Z\I\L\O\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\I\Z\Z\O\Z\Z\O\I\Z\O\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\L\I\Z\I\L\O\L\I\I\n\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\Z\O\Z\O\Z\O\n\O\Z\O\Z\O\I\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\I\L\I\I\I\L\O\I\I\L\I\I\L\I\Z\I\L\O\L\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\L\O\Z\I\L\O\Z\I\L\O\Z\I\Z\O\L\O\Z\I\Z\O\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\n\Z\Z\I\L\O\Z\I\L\Z\Z\I\L\O\I\L\Z\I\L\Z\I\I\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\Z\Z\O\I\I\Z\Z\O\Z\Z\O\I\L\O\Z\Z\Z\O\Z\I\n\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\L\O\L\I\L\I\Z\I\L\O\L\I\Z\I\L\O\Z\I\L\O\n\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\Z\I\Z\O\Z\O\L\O\Z\I\Z\O\I\Z\O\I\I\Z\O\I\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\L\Z\I\L\I\L\Z\I\L\Z\I\I\L\I\L\I\L\I\L\I\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\Z\Z\O\Z\Z\O\Z\I\Z\O\Z\O\Z\O\Z\O\L\O\Z\Z\O\Z\Z\O\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\L\O\L\I\Z\I\L\O\I\L\I\I\I\L\I\I\Z\I\L\O\L\I\Z\I\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\Z\I\Z\O\I\Z\O\I\L\O\Z\I\L\O\Z\I\I\Z\O\I\Z\O\L\O\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\L\Z\I\I\L\I\L\I\Z\I\L\Z\Z\I\L\O\L\I\L\I\I\L\Z\I\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\Z\Z\O\I\Z\O\Z\O\I\Z\Z\O\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\L\O\L\I\I\L\I\I\L\I\Z\I\L\O\L\I\I\L\I\I\L\I\Z\I\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\Z\I\Z\O\L\O\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\z\z\j\l\z\j\l\z\j\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\L\Z\I\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n}
     }
\hskip -177.6pt \vbox{\hbox{$\bigcirc$ \hskip -11.5pt ${\scriptstyle R}$} \vskip116.2pt \hbox{ } }
\hskip  -86.2pt \vbox{\hbox{$\bigcirc$ \hskip -11.4pt ${\scriptstyle Y}$} \vskip100.2pt \hbox{ } }
\hskip  -51.8pt \vbox{\hbox{$\bigcirc$ \hskip -11.4pt ${\scriptstyle F}$} \vskip 85.8pt \hbox{ } }
\hskip   98.7pt \vbox{\hbox{$\bigcirc$ \hskip -11.7pt ${\scriptstyle X}$} \vskip 80.2pt \hbox{ } }
\hskip  172pt
}  
$$
In the above 2-dimensional illustration,
both $X$ and its left neighbour (now called $Y$) can move between the further-left neighbour $F$ and the original right neighbour $R$,
allowing synergy between $X$ and $Y$ across a somewhat greater range.

If a birth event succeeds in incrementing the number of atoms, a good change has been made, with likely improvement to the suggested random location,
and compensatory movement of a neighbour.
Even if it fails, the neighbour will almost certainly have moved, so some change will have occurred.
If a death event succeeds in decrementing the number of atoms, a neighbour will have moved to compensate,
allowing the change to be more sympathetic to the data (hence more favoured).
If not, then at least the selected atom and a neighbour will almost certainly have both moved.
So, overall, LifeStory2 should almost always make useful changes, and thereby should be at least twice as powerful as LifeStory1.

One can envisage generalising LifeStory2 by allowing yet more atoms to move.
However, bringing in more atoms will expand the domain they cover, and may well bring in additional constraints.
Slice sampling would have to work harder to find an acceptable pattern, and the movements would be correspondingly smaller.
My judgment is that LifeStory2 is often likely to be about the best compromise.

\bigskip
\noindent{5.2.3. \it GuidedWalk}
\smallskip

The LifeStory engines explore a very roughly spherical domain about one or two selected positions.
Especially in many dimensions, where the likelihood function is increasingly capable of constraining an atom anisotropically, this can be inefficient.
An atom is allowed to move only a short distance along strongly-constrained directions, 
and this restriction carries over to weakly-constrained directions if exploration is isotropic, making their exploration slow.
As it happens, the number of attributes (dimensions) ascribed to an atom is often small, so the mere fact of using an atomic prior much reduces the difficulty:
we seek to control only one or two atoms at a time, not the entire object.
Even so, it may be useful to avoid the inefficiency by using the local shape of the likelihood function.
In ``classical analogue'' style, high-order methods such as conjugate gradient for maximisation, 
and hybrid Monte Carlo (Duane {\it et al.} 1987, Neal 1993) for probabilistic exploration, come to mind.
These methods use a sequence of intermediate evaluations (of gradient as well as value of likelihood) to discover and incorporate the local shape.

Alternatively, we could simply use the ensemble.
Let $X$ be a randomly-selected atom that we wish to move, in accordance with the local likelihood function.
We may presume that the existence of $X$ at its particular location already represents some feature of the likelihood, 
in which case other objects should also have atoms in similar locations representing the same feature.
Let $L$ and $R$ be left and right neighbour atoms of the position of $X$, drawn from different objects.
If, indeed, each object contains just one atom for the feature in question, 
then that will lie fairly closely left of $X$ along the Hilbert curve half the time, and fairly closely right of $X$ the other half.
So $L$ from its object should be the appropriate corresponding atom about half the time, and so should $R$ from its object.
Even if the precise correspondence and symmetry are relaxed, 
there should remain an appreciable ${\cal O}(1)$ probability ${\cal P}$ that $L$ and $R$ have a similar environment to~$X$.
The offset vector $({\bf R} - {\bf L})$ will then be obedient to the extent and shape of the local likelihood function.
Hence it can be suggested as an appropriate increment for the position of $X$, yielding a new trial location
$$
    {\bf X}^{\rm trial} = {\bf X} + s({\bf R} - {\bf L}), \qquad s = {\cal O}(1).
$$
Instead of a quasi-isotropic random walk, we have a {\bf guided} walk in the direction $\pm({\bf R} - {\bf L})$.

As usual, the scale $s$ of the change cannot be too large.
Larger values of $|s|$ increase the risk that the neighbours of $X^{\,\rm trial}$ will no longer be $L$ and $R$,
breaking detailed balance.
With randomly scattered atoms and $|s|=1$, the neighbours will only be correct with probability ${\cal O}(2^{-d})$, 
basically because there are $2^d$ quadrants in which other atoms might intervene.
That would limit $|s|$ to $1 \over 2$ or so, which reduces the potentially interfering atoms by the compensating volumetric factor $2^d$.
Moreover, if there are $\nu$ Gaussian constraints active on an atom, then $|s|$ is limited to about $\nu^{-1/2}$,
otherwise the trial location will usually be rejected.
However, if the local atoms do suffice to represent a local likelihood feature, then we might be allowed to go further, half way out to the next nearby feature.
It is hard to make this sort of argument precise, but fortunately we can use slice sampling to keep $s$ nearly optimal, whatever the restrictions.

Suppose there is one (or more) direction that is completely unconstrained, so that the likelihood function is extremely anisotropic.
At any stage in the computation, the ensemble will have its corresponding atoms $X,L,R,\ldots$ distributed somehow along this axis,
say with variance $\| \delta x \|^2 = \sigma^2$.
After a step of the guided walk with $s=\nu^{-1/2}$, the variance of the newly accepted $X^{\,\rm trial}$ will be $(1+2s^2)\sigma^2 = (1+2/\nu)\sigma^2$,
greater than before by a factor $1+2/\nu$.
Hence the spread of atoms along the unconstrained direction(s) increases {\bf exponentially}, 
with $\| \delta x \|$ gaining a factor something like $e^{\cal P}$ after $\nu$ ensemble-wide applications of the guided walk.
Exponential growth occurs because the algorithm is invariant under affine transformation, so it automatically adjusts to whatever scales are operative
(until supposedly-corresponding atoms become so distant that they can no longer be identified as such).
It would be too optimistic to expect to recognise exponential behaviour in realistic applications.
What we may see, though, is easy exploration of badly conditioned likelihood functions.

To implement GuidedWalk, we extend the vector $({\bf R} - {\bf L})$ across the unit hypercube, 
starting at the arbitrary origin of the current space-filling Hilbert curve
and continuing until the most-rapidly-changing coordinate ($\xi$ say) increments by 1.
On the digital $B$-bit grid, this defines a ``staircase'' of $2^B$ points parameterised by $\xi$.
This is displaced along the less-rapidly-changing axes until it passes through ${\bf X}$, as shown by the circles in the illustration below 
(note the horizontal wraparound of 5 points caused by the Hilbert origin being randomised to $(5,7)$).

$$
\hbox
{
\vbox{\lineskip = 0pt \baselineskip = 0pt
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\j\n\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\n\o\z\o\z\o\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\o\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\n\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\n\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n}
     }
\hskip-177.8pt \vbox{\hbox{$\bigcirc$ \hskip -11.5pt ${\scriptstyle R}$} \vskip115.8pt \hbox{ } }
\hskip -86.2pt \vbox{\hbox{$\bigcirc$ \hskip -11.0pt ${\scriptstyle L}$} \vskip100.0pt \hbox{ } }
\hskip  58.9pt \vbox{\hbox{$\bigcirc$ \hskip -11.7pt ${\scriptstyle X}$} \vskip 80.4pt \hbox{ } }
\hskip-155.0pt \vbox{\hbox{$\circ$}                                                                                                       \vskip115.4pt \hbox{ }}
\hskip-6.670pt \vbox{\hbox{                         $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip120.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip 60.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip 65.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$} \vskip 70.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$} \vskip 75.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$                         } \vskip 80.4pt \hbox{ }}
\hskip-1.665pt \vbox{\hbox{$\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\bullet$} \vskip 85.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\bullet$ \hskip-3.335pt $\bullet$ \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip 90.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip 95.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip100.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip105.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}   \vskip110.4pt \hbox{ }}
\hskip-3.335pt \vbox{\hbox{$\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$   \hskip-3.335pt $\circ$}                            \vskip115.4pt \hbox{ }}
}  
$$
Filled circles identify those points of the staircase which lie on that part of the Hilbert curve for which 
$L$ and $R$ remain the left and right neighbours in their own objects.
These are the trial points that are in detailed balance with $X$.
To select one, we use binary slice sampling along the staircase, which almost guarantees acceptable movement along what is appreciably often a useful direction.

\vfill\eject
\noindent{5.2.4. \it Leapfrog1 and Leapfrog2}
\smallskip

There are simpler ways of using the neighbours $L$ and $R$ of a selected atom $X$.
Leapfrog1 uses just one leftward or rightward neighbour, $L$ or $R$, preferably from an object different from that of $X$, though it need not be.
It takes
$$
   {\rm either} \qquad {\bf X}^{(1)} = 2{\bf L} - {\bf X} \qquad {\rm or} \qquad {\bf X}^{(1)} = 2{\bf R} - {\bf X}
$$
as a trial location, inverting $X$ through $L$ or $R$ without any tunable coefficient.
The trial location must be in detailed balance with $X$.
Hence if $L$ or $R$, in its own object, is the left (right) neighbour of the location of $X$, 
then it must also be the right (left) neighbour of $X^{(1)}$ without any other intervening atoms.
Whether this is likely depends on how atoms are distributed locally.
Provided this neighbourhood condition holds, 
the Metropolis-Hastings rule can be used to accept or reject the suggestion on the basis of its likelihood relative to $X$.

Leapfrog2 uses two neighbours, $L$ to the left and $R$ to the right, preferably drawn from objects other than that of $X$, though both could come from the same object as $X$.
It takes
$$
   {\bf X}^{(2)} = {\bf L} + {\bf R} - {\bf X}
$$
as a trial location, inverting $X$ through the midpoint of $L$ and $R$ 
(which may be usefully closer to the centre of the local feature than either $L$ or $R$ individually).
The trial location will be in detailed balance if $L$ remains its leftward and $R$ remains its rightward neighbour.
Provided this neighbourhood condition holds, 
the Metropolis-Hastings rule can again be used to accept or reject the suggestion on the basis of its likelihood relative to $X$.
In the illustration below, Leapfrog1 can invert $X$ to either of the points marked \hbox{$\bigcirc$ \hskip -10.2pt ${\scriptstyle 1}$ \hskip 3pt},
and Leapfrog2 can invert $X$ to \hbox{$\bigcirc$ \hskip -10.2pt ${\scriptstyle 2}$ \hskip 3pt}.
$$
\hbox
{
\vbox{\lineskip = 0pt \baselineskip = 0pt
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z}
      \hbox{\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\n\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\o\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\z\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\z\o\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\o\n\l\o\l\j\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\j\l\z}
      \hbox{\z\o\n\o\z\j\l\o\z\z\z\o\z\j\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\l\o\z}
      \hbox{\z\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\j\n\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\z\j\l}
      \hbox{\n\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\j\z\o}
      \hbox{\z\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\l\j\l}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\n\o\z\o\z\o\j\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\z\z\o\j\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\o\j\j\l\j\j\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\l\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\z\o\l\o\z\j\z\o\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\j\l\z\j\l\z\j\j\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\l\o\z\z\z\o\z\j\n\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\l\o\z\z\z\o\z\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\z\j\l\o\z\j\l\o\n\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\j\z\o\j\j\z\o\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\z\j\l\z\j\j\j\l\z\j\l\z\j\l\j\l\z\j\l\z\j\j\l\j\l\j\l\j\l\j\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\o\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\l\j\l\z}
      \hbox{\n\z\z\o\z\j\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o\z\z\o\z\j\z\o\z\o\z\o\z\o\l\o\z\z\o\z\z\o\z\z\o}
      \hbox{\z\j\z\j\l\o\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l\j\z\j\l\o\j\l\j\j\j\l\j\j\z\j\l\o\l\j\z\j\l\o\l}
      \hbox{\z\o\j\z\o\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z\o\j\z\o\j\l\o\z\j\l\o\z\j\j\z\o\j\z\o\l\o\z\j\z}
      \hbox{\n\j\l\j\l\j\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j\j\l\j\l\j\z\j\l\z\z\j\l\o\l\j\l\j\j\l\z\j\l\z\j}
      \hbox{\n\j\z\o\z\o\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o\j\z\o\z\o\j\z\z\o\z\z\o}
      \hbox{\z\j\j\l\j\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l\j\j\l\j\j\l\j\z\j\l\o\l}
      \hbox{\z\o\l\o\z\j\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z\o\l\o\z\j\z}
      \hbox{\n\l\z\j\l\z\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j\l\z\j}
      \hbox{\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z\o\z}
      \hbox{\n\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j\j\j\l\j}
      \hbox{\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z\j\l\o\z}
      \hbox{\z\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l\z\z\j\l\o\z\j\l}
      \hbox{\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z\o\z\z\o\j\j\z\z}
      \hbox{\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z\j\l\o\l\j\l\j\z}
      \hbox{\z\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l\o\z\j\z\o\z\o\l}
      \hbox{\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n\j\j\n\j\j\j\j\n}
     }
\hskip -172.5pt \vbox{\hbox{$\bigcirc$ \hskip -10.2pt ${\scriptstyle 1}$} \vskip150.3pt \hbox{ } }
\hskip  -86.6pt \vbox{\hbox{$\bigcirc$ \hskip -10.2pt ${\scriptstyle 2}$} \vskip136.0pt \hbox{ } }
\hskip  -85.0pt \vbox{\hbox{$\bigcirc$ \hskip -10.2pt ${\scriptstyle 1}$} \vskip120.4pt \hbox{ } }
\hskip  135.2pt \vbox{\hbox{$\bigcirc$ \hskip -11.5pt ${\scriptstyle R}$} \vskip116.2pt \hbox{ } }
\hskip  -86.4pt \vbox{\hbox{$\bigcirc$ \hskip -11.0pt ${\scriptstyle L}$} \vskip100.0pt \hbox{ } }
\hskip   58.9pt \vbox{\hbox{$\bigcirc$ \hskip -11.7pt ${\scriptstyle X}$} \vskip 80.2pt \hbox{ } }
\hskip  172pt
}  
$$

Like GuidedWalk, Leapfrog1 and Leapfrog2 are capable of increasing the spread of atoms along unconstrained directions {\bf exponentially}.
And, lacking the slice-sampling loop, they are simpler and faster to compute.
However, if the atoms are subject to several constraints, the suggested changes will usually be rejected.
Supposing that the local atoms are distributed in accordance with a locally Gaussian likelihood having $\nu$ constraints,
each atom's chisquared misfit should be $\chi^2 \sim \nu$.
But, according to the generating formulas, the misfits at the trial locations are likely to be larger: $5\nu$ for Leapfrog1 and $3\nu$ for Leapfrog2.
The net effect is that the acceptance rate drops exponentially with the number of constraints, 
roughly like $e^{-0.4\nu}$ for Leapfrog1 and $e^{-0.25\nu}$ for Leapfrog2.
This means that the Leapfrog engines can only operate well when the individual atoms have just a few constraints,
though that will certainly be true if their number of attributes $d$ is small.

No matter how long they are run for,
the Leapfrog engines can only move atoms around the discrete lattice of points that are integer combinations of the original positions.
Whether or not this pattern of points is technically irreducible depends on Diophantine subtleties of 
whether the offsets in this lattice are co-prime in each dimension with the number of grid points, here $2^B$.
So the test of irreducibility may be less clear than one might suppose.
In practice, we play safe and consider the Leapfrog engines to lack the irreducible property, 
resolving always to use them alongside irreducible engines like LifeStory which do provide full exploration. 

\bigskip
\noindent{5.2.5. \it Chameleon1}
\smallskip

The Chameleon1 engine operates on two objects.
It tries to make a randomly chosen atom jump from one object to another.
$$
\vbox{\hbox{$\,n$ atoms}
      \hbox{\ }
      \hbox{$m$ atoms}
     }
\hskip 6pt
\vbox{\hbox{{\raise2pt\hbox to 140pt{\hrulefill}} \hskip -132pt $\bullet$ \hskip 36pt $\bullet$ \hskip 28pt $\bullet$ \hskip 13pt $\bullet$ \hskip 19pt}
      \hbox{\hskip 55.5pt $\downarrow$}
      \hbox{{\raise2pt\hbox to 140pt{\hrulefill}} \hskip -117pt $\bullet$ \hskip 21pt \hskip 8pt \hskip 18pt $\bullet$ \hskip 33pt $\bullet$ \hskip 9pt}
     }
\hskip 6pt
\vbox{\hbox{Source object}
      \hbox{\ }
      \hbox{Destination object}
     }
$$
The idea is that an atom might be better placed in an object other than its source, especially if a useful patch of the hypercube is only just being discovered.
Occasionally this could help the destination object out of a trap, so the engine could be worth including even if most of its transitions were rejected.
Chameleon1 cannot be used on its own because its transitions are not irreducible: 
it can only use existing atom positions and cannot generate all the other possibilities.
Hence it must be used with an irreducible engine such as LifeStory.

Care is needed with detailed balance.
Let the original ensemble state ``$i$'' have $n$ atoms in the source object and $m$ in the destination.
This occupancy has prior probability $\pi(n)\,\pi(m)$.
The destination state ``$j$'' will have $n-1$ and $m+1$ atoms respectively, with prior probability $\pi(n-1)\,\pi(m+1)$.
We require balance between the forward and backward transitions between these states.
$$
\lower 8pt\vbox{ \hbox{$\,n$ atoms}
                 \vskip 4pt
                 \hbox{\hskip 14pt \vbox{\offinterlineskip
                                         \hrule
                                         \halign{    & \vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                                           height4pt &  \omit &                       & \omit \cr
                                                              &         $i$           &       \cr
                                           height4pt &  \omit &                       & \omit \cr
                                                }
                                         \hrule
                                        }
                      }
                 \hbox{$m$ atoms}
               }
\hskip 4pt
\vbox{ \hbox{${\buildrel     T_{ji}\,\pi(n)\,\pi(m)                      \over      {\hbox to 100pt{\rightarrowfill}}}     $}
       \hbox{${\buildrel {\displaystyle{\hbox to 100pt{\leftarrowfill}}} \over {\scriptstyle{T_{ij}\,\pi(n-1)\,\pi(m+1)}}} $}
     }
\hskip 4pt
\lower 8pt\vbox{ \hbox{$\,n-$1 atoms}
                 \vskip 4pt
                 \hbox{\hskip 10pt \vbox{\offinterlineskip
                                         \hrule
                                         \halign{    & \vrule#&\strut\ \ \hfil#\hfil\ &\vrule#\cr
                                           height4pt &  \omit &                       & \omit \cr
                                                              &         $j$           &       \cr
                                           height4pt &  \omit &                       & \omit \cr
                                                }
                                         \hrule
                                        }
                      }
                  \hbox{$m+$1 atoms}
                }
$$
Hence the transition ratios must obey
$$
   {T_{ji} \over T_{ij}} = {\pi(n-1)\,\pi(m+1) \over \pi(n)\,\pi(m)} = {n \over \beta_{n-1}}\,{\beta_m \over m+1}
$$
(remembering the birth and death rates for faithfully sampling the prior).  This is implemented with a forwards rate
$$
    T_{ji} = n \beta_m\,dt
$$
in an infinitesimal interval $dt$ of artificial time, implying the balancing backwards rate $T_{ij} = (m+1) \beta_{n-1}\,dt$.
Starting with two objects with $n$ and $m$ atoms, jumps of an atom from $n$ to $m$ occur at rate $n\beta_m$,
whereas jumps the other way from $m$ to $n$ occur at rate $m\beta_n$.
$$
\lower 12pt\vbox{ \hbox{$\,n-$1}
                  \hbox{\ }
                  \hbox{$m+$1}
                }
\Bigg\{
\lower 12pt\vbox{ \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -72pt $\bullet$ \hskip 16pt \hskip 8pt \hskip 8pt \hskip 8pt \hskip 3pt $\bullet$ \hskip 9pt}
                  \hbox{\hskip 29.5pt $\downarrow$}
                  \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -66pt $\bullet$ \hskip 10pt $\bullet$ \hskip 8pt $\bullet$ \hskip 10pt $\bullet$ \hskip 2pt}
                }
\Bigg\}
\hskip10pt
\raise 2pt\hbox{${\buildrel n\beta_m \over {\hbox to 40pt{\leftarrowfill}}} $}
\hskip10pt
\lower 12pt\vbox{ \hbox{$n$}
                  \hbox{\ }
                  \hbox{$m$}
                }
\Bigg\{
\lower 12pt\vbox{ \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -72pt $\bullet$ \hskip 16pt $\bullet$ \hskip 8pt \hskip 8pt \hskip 3pt $\bullet$ \hskip 9pt}
                  \hbox{\ }
                  \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -66pt $\bullet$ \hskip 10pt \hskip 8pt \hskip 8pt $\bullet$ \hskip 10pt $\bullet$ \hskip 2pt}
                }
\Bigg\}
\lower 12pt\vbox{ \hbox{$n$}
                  \hbox{\ }
                  \hbox{$m$}
                }
\hskip10pt
\raise 2pt\hbox{${\buildrel m\beta_n \over {\hbox to 40pt{\rightarrowfill}}} $}
\hskip10pt
\Bigg\{
\lower 12pt\vbox{ \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -72pt $\bullet$ \hskip 16pt $\bullet$ \hskip 8pt $\bullet$ \hskip 3pt $\bullet$ \hskip 9pt}
                  \hbox{\hskip 46pt $\uparrow$}
                  \hbox{{\raise2pt\hbox to 74pt{\hrulefill}} \hskip -66pt $\bullet$ \hskip 10pt \hskip 8pt \hskip 8pt \hskip 8pt \hskip 10pt $\bullet$ \hskip 2pt}
                }
\Bigg\}
\lower 12pt\vbox{ \hbox{$\,n+$1}
                  \hbox{\ }
                  \hbox{$m-$1}
                }
$$
The time to the next event is exponentially distributed as
$$
    \Pr(\Delta t) = (n\beta_m + m\beta_n)\,\exp\!\big(-(n\beta_m + m\beta_n)\,\Delta t\big).
$$
When that event occurs, it is either $n$-to-$m$ or $m$-to-$n$ in ratio $n\beta_m\!:\!m\beta_n$.
Using an interval $\tau = {\cal O}(n+m)^{-1}$ of artificial time between observations offers each atom an ${\cal O}(1)$ chance of jumping.
This is the natural ``period'' for which to run Chameleon1 between a given pair of objects, randomly selected from the ensemble.

A suggested jump is accepted or rejected on the basis of the ensemble likelihood 
$$
  {\cal L}\ = \prod_{\rm objects} L({\rm object})
$$
in which the only changeable factors are the likelihoods of the source and destination.
According to the Metropolis-Hastings rule, a jump is accepted if
$$
  {\cal L}_{\rm new} \ge {\tt Uniform}(0,{\cal L}_{\rm old})
$$
and rejected otherwise.

It would be possible to use slice sampling to merge an atom's jump between objects with movement along the Hilbert curve, 
as was done in LifeStory for birth and death.
However, even that version would still be constrained to a fixed total number of atoms in the ensemble,
and I do not expect the extra complication would yield any worthwhile advantage in power.

\bigskip
\noindent{5.2.6. \it Chameleon2}
\smallskip

In the Chameleon2 engine, pairs of atoms exchange their ensemble membership.
Equivalently, they exchange positions.
$$
\vbox{\hbox{\hskip 47pt {$\scriptstyle\rm Select$}}
      \hbox{{\raise2pt\hbox to 140pt{\hrulefill}} \hskip -132pt $\bullet$ \hskip 36pt $\bullet$ \hskip 16pt $\bullet$ \hskip 25pt $\bullet$ \hskip 19pt}
      \hbox{\hskip 55.5pt $\downarrow$ \hskip 38pt $\uparrow$}
      \hbox{{\raise2pt\hbox to 140pt{\hrulefill}} \hskip -110pt $\bullet$ \hskip 60pt $\bullet$ \hskip 13pt $\bullet$ \hskip 9pt}
      \hbox{\hskip 87pt {$\scriptstyle\rm Exchange$}}
     }
$$
To encourage synergy between the jumps, the chosen atoms should be close together, 
so the exchange atom is taken to be a neighbour (in its own object) of the position of the atom first selected.
Detailed balance is then trivially assured.
The transitions do not affect the number of atoms in either object, so they stay faithful to the prior without needing to adjust the relative rates.
Suggested jumps are again accepted or rejected on the basis of the ensemble likelihood, according to whether or not
$$
  {\cal L}_{\rm new} \ge {\tt Uniform}(0,{\cal L}_{\rm old})\,.
$$

As was done for Chameleon1, it seems appropriate for an iterate to operate the engine on a given pair of objects
for long enough that each atom is offered an ${\cal O}(1)$ chance of jumping.

\vfill\eject

\noindent{$\underline{\hbox{\bf{Section 6. Massive Inference (MassInf)}}}$}
\bigskip

MassInf (an acronym for point-{\bf Mass} atoms in {\bf Inf}erence, originally with particular reference to {\bf mass} spectrometry) is an extension to BayeSys,
for use when attributes include additive flux parameters which relate to {\bf linear} data having Gaussian or Poisson errors.  
In this special but quite common case, the flux parameters can be processed semi-analytically, which means less load on the program, and enhanced power.
Each atom in the prior model has flux parameters $z$ which are treated separately from the other attributes~$x$.
For example, the atoms for a black-and-white image would have a single flux parameter representing brightness,
whereas atoms for a colour image might have three, one for each primary colour red, green, blue.
I call the number of such fluxes the {\tt Valency} as a mnemonic for the number of ways the atom can bond to data.
For imagery the fluxes would be positive, though in other applications $z$ might take either sign.

\bigskip
\noindent{6.1. MASSINF PRIORS}
\smallskip

Massive Inference allows a choice of four priors, selected by the {\tt MassInf} parameter.
In each case, the flux is factorised as
$$
    z = \zeta q
$$
where $q$ is a dimensional unit of flux (common to all the atomic fluxes in any single object), and $\zeta$ is the individual dimensionless coefficient.
The prior on $q$ is assigned as
$$
    \pi(q) = q_0^{-2}\,q\,e^{-q/q_0}
$$
where $q_0$ is an initial global guess about the expected magnitudes of the atomic fluxes,
and the ``$xe^{-x}$'' form is chosen to be tractable whilst keeping $q$ away from both 0 and $\infty$.

The hyperparameter $q_0$ should have little effect, because $q$ will usually be influenced much more strongly by the numerous data.
But it must be given: to be fully consistent with the strictest Bayesian paradigm, 
any dimensional dataset should be accompanied by a similarly dimensioned constant giving the expected size of the phenomenon being observed.
Thus data on flux require a ``flux unit'' $q_0$ before they are interpretable.
I prefer you to provide this, but if you do not, MassInf is able to cheat a little and peek at the data to guess a value.

The four MassInf priors for the dimensionless fluxes are:-

\centerline{\vbox{\vskip 4pt
\offinterlineskip
    \halign{&\vrule#&\strut\quad\hfil#\hfil\quad&\vrule#&\strut\quad#\quad\hfil                     &\vrule#& \strut\quad#\hfil\quad                         &\vrule#\cr
 \noalign{\hrule}                                                                                                                                                    \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
                    &      {\tt MassInf}        &    &  \hfil Description                              &    &\qquad Prob($\zeta$)                            &       \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
 \noalign{\hrule}                                                                                                                                                    \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
                    &            0              &    & ``monkeys'', degenerate case with $\zeta$ fixed &    & $\delta(\zeta-1)$                              &       \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
                    &            1              &    & ``positive'', $\zeta > 0$, the commonest case   &    & $\exp(-\zeta)$                                 &       \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
                    &            2              &    & ``positive/negative'', $\zeta$ has either sign  &    & $\hbox{$1\over2$} \exp(-|\zeta|)$              &       \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
                    &            3              &    & ``Gaussian'', $\zeta$ has either sign           &    & $\exp(-\hbox{$1\over2$} \zeta^2) /\sqrt{2\pi}$ &       \cr
          height2pt &           \omit           &    &         \omit                                   &    &         \omit                                  &       \cr
 \noalign{\hrule}                                                                                                                                                    \cr
          }
       } }

The ``monkey'' prior ({\tt MassInf} = 0) models a team of monkeys throwing balls of equal magnitude $q$ at the unit box defined by the remaining coordinates $x$.
It was the original motivation for maximum entropy image reconstruction, as developed with the Cambridge group (Gull \& Daniell 1978).
On dividing the box into cells, the probability that $N$ balls will be distributed as $(n_1,n_2,\ldots)$ is given by the degeneracy
$$
   \Pr({\bf n}\mid N) \propto {N!\over n_1!\,n_2!\,\ldots} = \Omega,
$$
which suggests using the entropy $S = \log\Omega$ as a prior for flux.
Taking the model literally, though, imposes an unfortunate digitisation of flux to integer multiples of $q$ (Jaynes 1986 discusses related problems), 
and as noted in section 2.4 the smoothed entropy which approximates the factorials does not quite work as a Bayesian prior.

The ``positive'' prior ({\tt MassInf} = 1) avoids the monkey model's digitisation by letting the balls be of variable magnitude, as described by their exponential prior.
Although the BayeSys/MassInf system need not and does not do this, the unit box of spatial coordinates $x$ can be divided into cells.
Choosing a Poisson prior of mean $\alpha$ for the total number of atoms, a cell of size $\delta x$ will contain $r$ atoms, 
distributed as Poisson with mean $\mu = \alpha\,\delta x$.
$$
    \Pr(r) = e^{-\mu} \mu^r /r!, \qquad r=0,1,2,\ldots
$$
With the flux of each of the $r$ atoms distributed as
$$
    \Pr(\zeta_i) = \exp(-\zeta_i), \qquad i=1,2,\ldots,r
$$
the distribution for the total flux $\zeta = \sum\zeta_i$ in the cell is
$$
    \Pr(\zeta\mid r) = \cases{\delta(\zeta),                   &$r = 0$; \cr
                               e^{-\zeta}\,\zeta^{r-1}/ (r-1)!,&$r > 0$; \cr}
$$
and the net effect is that the total flux in the cell is distributed as
$$
    \Pr(\zeta) = \sum_{r=0}^\infty \Pr(\zeta\mid r)\Pr(r) = e^{-\mu}\big( \delta(\zeta) + e^{-\zeta} \sqrt{\mu/\zeta}\;I_1(2\sqrt{\mu\zeta}\,) \big)
$$
where $I_1$ is the modified Bessel function of the first kind, of order one.
It is intellectually satisfying to find that the number of atoms can be summed away analytically, albeit at the cost of doing a cell-wise computation afterwards.
It is also amusing to note that, almost regardless of the data,
the most probable (MAPP) inferred flux would thereby become precisely zero everywhere because of the surviving delta function ---
a delicious counter-example to MAPP estimation.
Pursuing the algebraic development beyond these remarks takes us too far afield, 
into the realms of measure theory and L\'evy-Khinchin representations (Feller 1971, Sibisi \& Skilling 1997).

The ``positive-negative'' prior ({\tt MassInf} = 2) is the natural generalisation of the positive-only
prior for applications where $z$ may take either sign.
Its cusp at zero is no disadvantage; it helps to reduce the flux of faint atoms of doubtful significance.

The ``Gaussian'' prior ({\tt MassInf} = 3) is an alternative prior when $z$ may be of either sign.
Its main disadvantage over ``positive-negative'' is that the expected flux $q$ is close to the r.m.s. value, 
so tends to be dragged up by any unusually large fluxes.
This leaves the fainter bulk of the fluxes less well controlled, whereas the largest fluxes are over-controlled and pulled back by the severe Gaussian tail.
Objects of large dynamic range are thus recovered less well.
The effect shows up quantitatively as a poorer (lower) value of the evidence for such objects, 
reflecting the improbability of having most fluxes well below the r.m.s. magnitude, and some well above.
On the other hand, the Gaussian prior $\exp(-x^2-y^2)$ on two fluxes $x$ and $y$ serves as a circularly symmetric prior for a complex $z=x+iy$, 
whereas the positive-negative $\exp(-|x|-|y|)$ does not.

\bigskip
\noindent{6.2. MASSINF LIKELIHOOD}
\smallskip

For MassInf to be used, the likelihood should factorise as
$$
    \Pr(D\mid {\rm atoms}) = L_{\rm Bayes}(x)\,L_{\rm MassInf}(z\mid x)
$$
with the flux-dependent part $L_{\rm MassInf}$ being either Gaussian as from linear data with normal errors, or of Poisson form from data counts.
The accompanying factor $L_{\rm Bayes}$ is often just 1 and ignorable, but it need not be.
For example, the data might be fluxes whose locations are themselves observed uncertainly,
implying explicitly location-dependent factors in the likelihood.

As for fluxes,
an atom of unit flux at position $x$ has a footprint (or ``point-spread-function'' or ``Green's function'') ${\bf f}(x)$ over the relevant data, 
several such if it has several fluxes.
The footprints of the constituent atoms and fluxes combine linearly to produce the mock data
$$
    {\bf F} = \sum z\,{\bf f}(x)
$$
as a list of numbers which compare with the actual data {\bf D}, through either Gaussian or Poisson statistics.
Bold font (${\bf D},{\bf F},\ldots$) here denotes data vectors.

\eject
\noindent {6.2.1. \it Gaussian data}
\smallskip

Under Gaussian statistics, data $D$ are accompanied by standard deviation uncertainties $\sigma$.
To keep the notation clean, we incorporate $\sigma$ into the definitions
$$
   {\bf X}\!\cdot\!{\bf Y} \equiv \sum X_k Y_k / \sigma_k^2, \qquad \|{\bf X}\|^2 \equiv {\bf X}\!\cdot\!{\bf X}
$$
of inner product and 2-norm.
We then write the chisquared misfit as
$$
    \chi^2({\bf F}) = \|{\bf F} - {\bf D}\|^2
$$
and this gives the likelihood
$$
    L_{\rm MassInf}({\bf F}) = {\cal Z}^{-1} e^{-\chi^2 / 2}
$$
where ${\cal Z} = \prod\sqrt{2 \pi \sigma_k^2}$ is the dimensional normalisation.

MassInf uses these formulas to calculate the likelihood efficiently.
When an atom is created (with new flux $z$) or destroyed (by removing its flux) 
or moved (by destroying it at the old location then creating it at the new), the program updates the mock data by
$$
    \Delta{\bf F} = {\bf f}(x)\,\Delta z
$$
and the likelihood by
$$
    \Delta(\log L_{\rm MassInf}) = - \Delta{\bf F}\!\cdot\!(\hbox{$1 \over 2$} \Delta{\bf F} + {\bf F} - {\bf D})
$$
which should be faster than re-calculating the factor from scratch.

\bigskip
\noindent {6.2.2. \it Poisson data}
\smallskip

Under Poisson statistics, data $D$ are counts accumulated in the various bins $k$ of the dataset.
These are accompanied by pre-subtracted background counts $B$, which increase the statistical uncertainties.
The likelihood of mock data $F$ is now
$$
    L_{\rm MassInf}({\bf F}) = \prod_k {(F_k + B_k)^{D_k + B_k} \over \Gamma(D_k + B_k + 1)}\; e^{-(F_k + B_k)}\,.
$$
The data are likely to start out as integers, 
but we allow generality by using the continuous gamma-function formulation $\Gamma(D_k + B_k + 1)$ instead of the integer form $(D_k + B_k)!$.

Without a background, the likelihood would become zero (with singular logarithm) if the mock data $F_k$ were 0 in any bin with positive count $D_k>0$.
Hence many trial objects, including the empty object from which BayeSys starts, would be singular.
Accordingly, there is a restriction that $B_k$ must be strictly positive wherever there are non-zero counts $D_k + B_k$.

When an atom is created or destroyed or moved, the mock data is incremented just as before, but the likelihood increment is a polynomial in $z$ instead of a Gaussian. 

\bigskip
\noindent{6.3. MASSINF FLUX UNIT}
\smallskip

MassInf uses Gibbs sampling to explore the joint distribution of the flux unit $q$ and the other variables, 
alternating between re-calibrating $q$ and performing the rest of its calculation to re-sample the atoms.
At the beginning of each iterate, before updating the ensemble, MassInf re-samples~$q$.
Temporarily write the mock data as ${\bf F}(z) = q{\bf F}^*(\zeta)$ to make explicit its dimensional scaling with~$q$.
With all other variables (including $\zeta$) fixed, the $q$-dependence of the likelihood is (exactly for Gaussian data and approximately for Poisson data)
$$
    \Pr(D\mid q,\ldots)\ \propto\ \exp(\,q {\bf F}^*\!\cdot\! {\bf D} - \hbox{$1 \over 2$} q^2 {\bf F}^*\!\cdot\! {\bf F}^*)\,.
$$
The Poisson approximation, incidentally, doesn't matter.
It just means that the implementation uses a prior for $q$ that differs slightly from the algebraic definition in this document.
The likelihood combines with the prior for $q$ to give a joint distribution
$$
    \Pr(D,q\mid\ldots)\ \propto\ q \exp( - q/q_0 + q {\bf F}^*\!\cdot\! {\bf D} - \hbox{$1 \over 2$} q^2 {\bf F}^*\!\cdot\! {\bf F}^*)
$$
which in turn is proportional to the posterior $\Pr(q\mid D,\ldots)$.
To re-calibrate $q$, MassInf re-samples from this ``$x\times {\rm Gaussian}(x)$'' distribution.
That deals with $q$.
For the rest of the calculation, $q$ is merely a constant that can be included in the appropriate prior for $z$.

\vfill\eject
\noindent{6.4. MASSINF FLUXES}
\smallskip

The BayeSys engines modify only one, or sometimes two, atoms at a time.
For exposition, we first consider modifying only one atom (say the $k^{\rm th}$), in the single-flux case ${\tt Valency} = 1$.
Let
$$
  {\bf F}^- = \sum_{i \ne k} z_i{\bf f}(x_i)
$$
be the mock data from all the other atoms, with the selected $k^{\rm th}$ removed.

Taking Gaussian data, the dependence of the likelihood on the selected flux $z_k$ (with all other variables fixed) is
$$
    \Pr(D\mid z_k, x_k, \ldots)\ \propto\
    \exp\big(-z_k ({\bf F}^-\! - {\bf D})\!\cdot\! {\bf f}(x_k) - \hbox{$1 \over 2$}\,z_k^2\,{\bf f}(x_k)\!\cdot\! {\bf f}(x_k)\big)
$$
and this combines with (say) the ``positive'' prior for $z_k$ to give a joint distribution
$$
    \Pr(D,z_k\mid x_k, \ldots)\ \propto\
    \exp\big( - z_k/q - z_k ({\bf F}^-\! - {\bf D})\!\cdot\! {\bf f}(x_k) - \hbox{$1 \over 2$}\,z_k^2\,{\bf f}(x_k)\!\cdot\! {\bf f}(x_k) \big)
$$
which in turn is proportional to the posterior $\Pr(z_k\mid D,x_k,\ldots)$.
Hence we can reset $z_k$ by sampling directly from this ``truncated Gaussian($\cdot$)'' distribution: we don't have to guess trial values which may be rejected.
The other MassInf priors give similarly tractable distributions.
Moreover, the joint distribution is {\bf integrable} to an error function:
$$
    \Pr(D\mid x_k, \ldots) = \int_0^\infty \Pr(D,z_k\mid x_k, \ldots)\,dz_k = \int_0^\infty {\rm Gaussian}(z_k)\,dz_k \ \Rightarrow\ \hbox{erf(coefficients)}\,.
$$
This is an explicit expression for the likelihood (or, if you prefer, the evidence) of $x_k$ alone.
Hence we can explore the location $x$ on its own, with the standard BayeSys engines, {\bf without} having to find the flux $z$ at the same time.
Dimensionality is reduced and exploratory power is improved.
Implicitly, all flux values are explored in parallel, and we delay picking a flux until {\bf after} a new acceptable location has been found.

\vbox
{
  {$$
       \matrix{       \cr                                       \cr \bullet                               \cr                                \cr       \cr}
       \matrix{       \cr                                       \cr                                       \cr\buildrel{\rm kill}\over\searrow\cr       \cr}
       \matrix{       \cr                                       \cr                                       \cr                                \cr  0    \cr}
       \matrix{       \cr                                       \cr                                       \cr \buildrel{\rm sum}\over\nearrow\cr       \cr}
\left\lfloor
       \matrix{ \cdot \cr   \cdot                               \cr  \cdot                                \cr  \cdot                         \cr \cdot \cr}
\right\rfloor
       \matrix{       \cr                                      \cr\ \buildrel\delta x\over\Longrightarrow\ \cr                               \cr       \cr}
\left\lfloor
       \matrix{ \cdot \cr   \cdot                               \cr  \cdot                                \cr  \cdot                         \cr \cdot \cr}
\right\rfloor
       \matrix{       \cr\buildrel{\rm pick}\over\longrightarrow\cr                                       \cr                                \cr       \cr}
       \matrix{       \cr \bullet                               \cr                                       \cr                                \cr       \cr}
  $$}
  {$$
  \hbox{Old $z$}\hskip 12pt \hbox{0}\hskip 20pt \hbox{All $z$}\hskip 26pt \hbox{All $z$}\hskip 14pt \hbox{New $z$}
  $$}
}
\noindent
Also, Gaussian forms over the positive quadrant remain tractable in 2 dimensions, 
so that the LifeStory2 engine (which moves two atoms in the same object) still works.
The fluxes of both atoms integrate out together, and are picked together after the new locations have been found.

Similar analysis holds for Poisson data, except that the error functions are replaced by polynomials.
The main practical difference is that their evaluation cost scales quadratically with the number of counts in an atom's footprint, 
or even cubically if the LifeStory2 engine is used, instead of merely scaling proportionally to the length of the footprint as for Gaussian data.

If an atom has several fluxes (${\tt Valency} > 1$), the analysis continues to hold, with the $z$-integrals being promoted to more dimensions.
However, the arithmetical evaluation of these relies on separability.
So, for practical implementation, an atom's footprints over the data must not overlap.
Thus you could not use MassInf if you had red, green, blue footprints from the same atom blurred into the same measurements.
If you want to use the LifeStory2 engine with ${\tt Valency} > 1$, there is a further restriction,
that a single footprint from one atom must not overlap more than one of the footprints from another.

To run MassInf, all you need to supply are the data $D$ and their uncertainties $\sigma$ or background $B$,
along with a procedure to give the footprint(s) ${\bf f}(x)$ at a trial position $x$.
The program takes care of the rest.
\vfill\eject

\noindent{$\underline{\hbox{\bf{Section 7. Display of results}}}$}
\bigskip

A posterior object derived from an atomic prior is intrinsically spiky.
As a function of $x$ it is a sum of delta functions, one at each location of an atom.
Likewise, any accumulation of objects remains a set of delta functions: summing over many objects merely produces more.
Yet most likelihood functions are spatially smooth because data have finite resolution.
So the actual posterior ought to be correspondingly smooth.
It is true that, for any pre-specified display resolution, long-term averaging will eventually produce a smooth result,
but only after large numbers of atoms have chanced to fall into each (small) cell $\delta x$.
This can take far too long to compute directly.
For display purposes, we want to show a limited number of atoms as a smooth function.
Note that this is not part of Bayesian analysis: it is openly a matter of professional communication and aesthetics.

BayeSys gives you some guidance.
Each atom it produces is accompanied by an estimate of its width, 
in the form of that fraction $f$ of the hypercube's volume that the atom could plausibly range through.
You may use this estimate as you please, depending upon your current application.
In one dimension $(d=1)$, the hypercube is just the unit interval $[0,1]$, and $f$ is a range $\delta x$ within it.
For display, an atom would presumably be shown as some sort of bell-shaped curve of correct area, correct centre, and width related to~$f$.
In two dimensions $(d=2)$, it would be natural to give each coordinate $(x_1,x_2)$ a width $f^{1/2}$.
Generally, in $d$ dimensions, a width $f^{1/d}$ might be appropriate so that the volume remains $f$.
On the other hand, you may wish to divide $f$ differently among the various dimensions, if these represent intrinsically different quantities.
You decide.

To find $f$, BayeSys keeps a historical log of its atoms, biassed towards the more recent and presumably better-equilibrated ones.
These many atoms are all put onto a standard Hilbert curve, along which their mean density
$$
  \rho(\ell) \propto {\hbox{number of atoms in interval} \over \hbox{length of interval $\Delta \ell$}}\ ,\quad \hbox{normalised to $\int\rho(\ell)d\ell=1$},
$$
is tolerably well defined over intervals wide enough to cover at least several atoms.
When a new object is produced, having $n$ atoms, each may be expected to range over a fraction $1/n$ of the atoms in the log.
Perhaps half of this may be assigned to randomness we might wish to use for smoothing, yielding a full width
$$
  f = \delta \ell = 1 / (2 \rho n)
$$
expressed as a fraction of the length of the Hilbert line, or equivalently as a fraction of the total prior volume.
This can conveniently be computed as a fixed atom-count $\pm {1\over 2} f {\cal N}$ to either side across the accumulated log of ${\cal N}$ atoms.

In ``important'' parts of the posterior, where accumulated atoms congregate closely, the width is small.
It is less than the size of the feature, so does not appreciably degrade the delineation of that part.
In unimportant parts of the posterior, where a few weakly-constrained atoms may wander about to form some broad background, the width becomes large.
Such stray atoms are displayed as wide and shallow, so that they no longer obtrude upon the eye as isolated sharp spikes.
As a technical refinement, BayeSys uses whichever of the leftward-looking and rightward-looking estimates of $\rho$ is larger.
This narrows the width so as to discourage atoms near the edge of an important domain from spilling out into the background,
which could degrade the display's visible resolution if it were allowed.

The diagram illustrates a log of atoms ``{\tt o}'' placed on the Hilbert line ``{\tt ...}''.
Below this are five atoms of a current object ``$\,\mid\,$'', 
and sufficient atoms are in the log that the fixed atom-count is 2 to either side
(in practice the count would be more because the log would be fuller, and the log would be sparse among the huge number of points on the Hilbert line).
Arrows in the diagram show how the widths are constructed, by counting 2 atoms across the log to whichever side is closer.
$$
\vbox{
      \hbox{\tt ..o..oo....oo.oo.....oo.ooo....ooooooo..ooooo...ooo.o...o...o....o...}
      \hbox{
            \hskip 45.4pt    \hbox to 16.0pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 16.0pt{\rightarrowfill}
            \hskip 30.3pt    \hbox to 16.0pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 16.0pt{\rightarrowfill}
            \hskip 20.0pt    \hbox to 10.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 10.5pt{\rightarrowfill}
            \hskip 15.0pt    \hbox to 10.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 10.5pt{\rightarrowfill}
            \hskip 19.8pt    \hbox to 31.5pt{\leftarrowfill} $\!\!\mid\!\!$ \hbox to 31.5pt{\rightarrowfill}}
     }
$$
Atoms in or at the edge of a dense region are narrow, whereas atoms in sparse regions are wide.

\vfill\eject

\centerline{\bigger PART 3. PRACTICE}
\bigskip
\noindent{$\underline{\hbox{\bf{Section 8. BayeSys prior parameters}}}$}
\bigskip

BayeSys models an object as a sum of ``{\bf atoms}'', which are simpler than the object itself.  
It is possible to consider a complicated object as just a single atom with many attributes, but the composite model is often appropriate.  
The composite model also aids the computational exploration of plausible objects, 
because adjusting one or two atoms at a time is likely to be easier than trying to adjust the entire object.  
Statisticians call this sum of atoms a ``mixture model''.

\bigskip
{\tt MinAtoms}, {\tt MaxAtoms}, {\tt Alpha}:

\noindent Being probabilistic, BayeSys needs a prior distribution for your object.
For the number of atoms, you need to supply an inclusive range $[{\tt MinAtoms},{\tt MaxAtoms}]$, 
in which {\tt MinAtoms} must be at least 1 (simply because the anomalous empty object is algorithmically awkward), 
and {\tt MaxAtoms} must obviously be at least as large as {\tt MinAtoms} (though ${\tt MaxAtoms}=0$ is interpreted as infinity, or no limit at all). 
If you supply ${\tt MaxAtoms}={\tt MinAtoms}$, that completes this part of the prior by forcing exactly that number of atoms.  
Otherwise, you have some control over the prior between {\tt MinAtoms} and {\tt MaxAtoms} (or infinity), 
through a third parameter {\tt Alpha} which can be zero (for uniform) or positive (for Poisson) or negative (for geometric).
The details are documented in section 5.1.1, but all you need to know is the rough behaviour
$$
\matrix{
     {\tt Alpha} > 0,  &\quad {\tt Natoms} \approx \alpha \pm \sqrt\alpha   \hfill\cr
     {\tt Alpha} = 0,  &\quad {\tt Natoms}  \hbox{ uniform}                 \hfill\cr
     {\tt Alpha} < 0,  &\quad {\tt Natoms} \approx |\alpha| \pm |\alpha|    \hfill\cr
       }
$$ 
each offset by ${\tt MinAtoms} \geq 1$ and not exceeding ${\tt MaxAtoms} \geq {\tt MinAtoms}$.  
Generally, I recommend using the less committal geometric choice, 
because most users do not have strong pre-conceptions about the number of atoms needed to build their object.  
Not that it matters much.  
Extensive data should soon come to dominate your prior assignment.

Incidentally, if you supply silly parameters, such as ${\tt MinAtoms} = 4$ with ${\tt MaxAtoms} = 2$, 
BayeSys will abort its run and exit to you with an appropriate negative return value.  
The same will happen if any other error state is detected during a run.

\bigskip
{\tt Ndim}, {\tt Valency}:

\noindent An atom will have some number of attributes, specified as the dimensionality {\tt Ndim}.  
These are represented by floating-point (``{\tt double}'') coordinates

\ \ \qquad\qquad  {\tt Cubes[r][i]},\qquad or\ \ {\tt Cube[i]}\ \ where\ \ {\tt Cube = Cubes[r]},

\noindent where $i = 0,1,\ldots,{\tt Ndim}-1$ gives the location and $r=0,1,2,\ldots$ lists the atoms in the model.  
Within BayeSys, the coordinates {\tt Cube[i]} of each atom range between 0 and 1, and have uniform prior.  
However, BayeSys includes a procedure {\tt BayeShape}, documented in section 5.1.2, that lets you transform some or all
of the {\tt Cube} values to new {\tt Coord} values in various standard shapes.

You can also instruct BayeSys to ignore parts of the prior, and concentrate the prior mass into the remaining volume.  
If your attributes still do not all fall neatly into one of these shapes, 
you will have to work out how to transform my {\tt Cube[.]} or {\tt Coord[.]} coordinates to your $X$ attributes, 
so that the transformation's Jacobian $\partial({\tt Cube})/\partial(X)$ properly represents your prior (my prior on {\tt Cube} is 1):
$$
     {\rm YourPrior}(X)\,d(X) = {\rm MyPrior}({\tt Cube})\,d({\tt Cube}) .
$$
You will not need the reverse ($X \rightarrow {\tt Cube}$) transformation, because BayeSys supplies you with locations, not the other way round.

Although they will come to take their own attribute values, the atoms are assumed to be {\it a-priori}-equivalent, and exchangeable. 
Do not assume that their serial numbers (``$r$'' in the {\tt Cubes[r][.]} list) are meaningful, 
because BayeSys creates, destroys, moves, exchanges and shuffles the atoms beneath you.  
However, you can allow atoms to be of effectively different types, by allowing one of the attributes to encode an integer switch.

Finally, {\tt Valency} enables the ``Massive Inference'' extension (sections 6 and 13).  
It allows flux-related attributes, stored in {\tt Cube[i]} for $i={\tt Ndim},\ldots,{\tt Ndim+Valency-1}$, to be treated separately.
For now, set ${\tt Valency} = 0$.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 9. BayeSys algorithm parameters}}}$}
\bigskip

In order to run BayeSys, you must initialise four algorithm parameters 
(in addition to the five prior parameters {\tt MinAtoms}, {\tt MaxAtoms}, {\tt Alpha}, {\tt Ndim}, {\tt Valency}).  
These are {\tt ENSEMBLE}, {\tt Method}, {\tt Rate}, and {\tt Iseed}.

\bigskip
{\tt ENSEMBLE}:

\noindent {\tt ENSEMBLE} is the number of trial objects that the BayeSys program evolves simultaneously.  
You can run the program with ${\tt ENSEMBLE}=1$ to use only one object.  
For an easy problem, this can work fine.  
For more difficult applications, though, it is safer to run with several, in case any single object becomes stuck near an awkward location.  
If the program holds several objects, 
it will use its initial ``annealing'' phase to overwrite with better ones any of them that become stuck at anomalously low likelihoods.
The ensemble as a whole is much less likely to get stuck, and can also evolve more efficiently, 
especially if the posterior distribution has local maxima that act as traps.
Hence the apparent computational penalty of a factor of {\tt ENSEMBLE} may be reduced or reversed.  Try it.  
And don't forget that you may want to accumulate lots of sample objects at the end anyway, when a large ensemble helps.


\bigskip
{\tt Method}:

\noindent{\tt Method} is a combination of binary switches that chooses which of the various exploration engines BayeSys is to use.  
These are explained in section 5.2, where you will find my general recommendation to set all switches ``on'' by using ${\tt Method} = -1$.  
The opposite, simplest alternative is ${\tt Method} = 0$.  Beyond that, the most important switch to know about is in the ``{\it two}s'' bit
({\it i.e.} ${\tt Method}\,\&\,2$), which switches on the powerful ``LifeStory2'' engine.  
There are reasons, such as laziness when writing a BayeSys application, or having a particularly complicated MassInf problem, 
or merely a suspicion that the extra power is proving too expensive, for wanting to turn the LifeStory2 engine off, 
which you do by making {\tt Method}/2 even so that ${\tt Method}\,\&\,2$ is 0 (perhaps by using ${\tt Method} = -3$ which switches all other bits on).


\bigskip
{\tt Rate}:

\noindent{\tt Rate} is your control over the speed of annealing, 
which is the initial path the program takes from exploring the prior (which is easy) to exploring the posterior (which is best reached gradually).  
The annealing schedule (section 4.1) should be robust, and usually a value in the general range 0.1 to 0.5 should be safe enough.  
Set {\tt Rate} too high and the program might go too fast to find the little patch of high posterior that you seek.  
Set {\tt Rate} too low, and the program will waste time crawling.  
There is no adequate theory to guide us here, but you should be able to find a suitable value for any particular type of problem.  
You can use an occasional slow run with excessively small {\tt Rate} to check that your results are stable, even if I can't guarantee that they are correct.


\bigskip
{\tt Iseed}:

\noindent{\tt Iseed} is the seed for BayeSys' random number generator. For reproducible runs, give it a positive value.  
If you give it a negative value, the program will take the current clock time for its seed.  
Hence different runs, started a second or more apart, will do different random exploration, which can be handy for generating statistics.  
The positive seed actually used will overwrite your value, so that you can still reproduce any individual run if you record this number.
\bigskip

The above are the essential nine parameters you must set before calling BayeSys.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 10. BayeSys structures}}}$}
\bigskip

The BayeSys interface uses two types of structure, a {\tt CommonStr} whose information is common to all the objects in the ensemble,
and an {\tt ObjectStr} for each individual object.
These structures are declared in the header file bayesys3.h accompanying the bayesys3.c source,
and each includes a {\tt void*} ``user'' pointer which you can use to address your own application-specific information.

\bigskip
{\tt CommonStr}:

\noindent You supply the prior and algorithm parameters in the single ``{\tt CommonStr}'' structure.  
For example, the ``bayestoy.c'' toy example program has (in effect) the lines:-
$$
\matrix{
     {\tt CommonStr\ \ Common}[1];\hfill &                    &                                                        \cr
     {\tt Common}\rightarrow{\tt MinAtoms}       \hfill &= 1;         \hfill &//{\tt \ >=\ 1}                                   \hfill\cr
     {\tt Common}\rightarrow{\tt MaxAtoms}       \hfill &= 0;         \hfill &//{\tt \ >=\ MinAtoms,\ or\ 0\ =\ infinity}       \hfill\cr
     {\tt Common}\rightarrow{\tt Alpha}          \hfill &= -1.0;      \hfill &//{\tt \ +ve\ for\ Poisson,\ -ve\ for\ geometric} \hfill\cr
     {\tt Common}\rightarrow{\tt Ndim}           \hfill &= 2;         \hfill &//{\tt \ dimension\ =\ \#\ coordinates}           \hfill\cr
     {\tt Common}\rightarrow{\tt Valency}        \hfill &= 0;         \hfill &//{\tt \ \#\ MassInf\ fluxes\ per\ atom}          \hfill\cr
     {\tt Common}\rightarrow{\tt ENSEMBLE}       \hfill &= 10;        \hfill &//{\tt \ \#\ objects\ in\ ensemble}               \hfill\cr
     {\tt Common}\rightarrow{\tt Method}         \hfill &= -1;        \hfill &//{\tt \ Algorithm\ method}                       \hfill\cr
     {\tt Common}\rightarrow{\tt Rate}           \hfill &= 0.1;       \hfill &//{\tt \ Speed\ of\ calculation\ (dimensionless)} \hfill\cr
     {\tt Common}\rightarrow{\tt Iseed}          \hfill &= 4321;      \hfill &//{\tt \ Random\ seed,\ -ve\ is\ time\ seed}      \hfill\cr
     }
$$

You can also use its components {\tt Ndata}, {\tt Data} and {\tt Acc}, to set up your data and its accuracy (section 6.2.1).  
I usually use accuracy (= $1/\sigma$) instead of the direct standard deviation $\sigma$ because it's more usual 
to have poor or absent measurements with infinite $\sigma$ and zero accuracy than to have perfect data with zero $\sigma$ and infinite accuracy,
and it's easier to assign 0 than $\infty$.
$$
\matrix{  
    {\tt int\ \ \ \ Ndata} \hfill &= {\tt .....};        \hfill  &                                       &              \cr
    {\tt double\  Data[.]} \hfill &= {\tt \{..,..,..\}}; \hfill  &                                       &              \cr
    {\tt double\  Acc [.]} \hfill &= {\tt \{..,..,..\}}; \hfill  &                                       &              \cr
    {\tt Common}\rightarrow{\tt Ndata}    \hfill &= {\tt Ndata};        \hfill  &//{\tt \ \#\ data}             \hfill  &              \cr
    {\tt Common}\rightarrow{\tt Data}     \hfill &= {\tt Data};         \hfill  &//{\tt \ data}                 \hfill  &{\tt [Ndata]} \hfill\cr
    {\tt Common}\rightarrow{\tt Acc}      \hfill &= {\tt Acc};          \hfill  &//{\tt \ accuracies = 1/sigma} \hfill  &{\tt [Ndata]}\qquad\qquad\ \  \cr
     }
$$
When using the MassInf extension, this convention about {\tt Ndata}, {\tt Data}, {\tt Acc} is enforced (sections 6.2.1 and 14.1),
though for Poisson data (sections 6.2.2 and 14.2) {\tt Acc} should contain the pre-subtracted background $B$ instead.
If you don't have a list of ordinary data fitting this scheme, you can supply anything else in a {\tt UserCommon} structure, which you can define as you like.  
I will pass this around for you if you let the {\tt CommonStr} structure know about it, as in the following pseudo-code.
$$
\matrix{  
    {\tt typedef\ struct}                     \hfill  &                                         \cr
    {\tt \lbrace}                             \hfill  &                                         \cr
    {\tt \ \ ....}                            \hfill  &//{\tt \ declare\ your\ structure}\qquad\quad \hfill\cr
    {\tt \rbrace\ \ UserCommonStr;}           \hfill  &                                         \cr
    {\tt UserCommonStr\ UserCommon[1];}       \hfill  &//{\tt \ allocate\ structure}      \hfill\cr
    {\tt UserCommon}\rightarrow{\tt ....\ =\ ....;}          \hfill  &//{\tt \ assign\ values}           \hfill\cr
    {\tt Common}\rightarrow{\tt UserCommon\ =\ UserCommon;}  \hfill  &//{\tt \ tell\ BayeSys\ about\ it} \hfill\cr
     }
$$

Often, you will use {\tt UserCommon} to accumulate the output statistics that are the principal results of your computation.

\vfill\eject
{\tt ObjectStr}:

\noindent Each object has its own {\tt ObjectStr} structure, again declared in the bayesys3.h header file.  
So, if you set ${\tt ENSEMBLE} = 10$, you will need to assign
$$
    {\tt ObjectStr\  Objects[10];\ \ \ \ \ //\ [ENSEMBLE]},\ \ \qquad\qquad\qquad\qquad\qquad\qquad\qquad
$$
and can then address any individual object by setting
$$
    {\tt ObjectStr*\  Object\ =\ \&Objects[.];}\ \qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
You don't need to place anything in these objects: it's my job to give objects to you, not the other way round.  
I supply you with trial lists of atoms, as
$$
\matrix{  
    {\tt int}\hfill &{\tt Object}\rightarrow{\tt Natoms;}\qquad\hfill &// {\tt \ contains\ \#\ of\ atoms}\qquad\qquad\qquad\cr
    {\tt double**}\ &{\tt Object}\rightarrow{\tt Cubes;}\qquad\hfill &// {\tt \ contains\ Cubes[r][i]},  \hfill\cr
     }
$$
You can address an individual atom by
$$
    {\tt double*\quad\ Cube\ =\ Cubes[.];}\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
If you wish, you can change some number ${\tt N} \le {\tt Ndim}$ of my {\tt Cube} coordinates to your own {\tt Coord} values by using
$$
    {\tt BayeShape(\ Coord,\ \&Cube[}first{\tt],\ N,\ Shape\ );} \qquad\qquad\qquad\qquad\qquad\qquad\qquad
$$
to transform to one of the shapes ${\tt Shape} = 0,1,\ldots,8$, as documented in section 5.1.2.

Your procedures should then provide me with corresponding information that lets me set
$$
    {\tt double\ \ \ \ \ Object}\rightarrow{\tt Lhood;\ \ \ \ \ \ //\ log\ Pr(Data|Object)}.\ \qquad\qquad\qquad\quad
$$
However, I may need to know where to put ancillary information that you want to keep while computing this information.  
With a list of ordinary data, you may well want to keep and use the corresponding list of mock data derived from my trial object, 
so, for each object in the ensemble, you would set
$$
    {\tt Object}\rightarrow{\tt Mock}\ = \hbox{\ pointer to array of {\tt Ndata} ``{\tt double}''.}\ \qquad\qquad\qquad\qquad\quad\quad
$$
Again, this convention is enforced when using the MassInf extension.
If you don't have ordinary data fitting this scheme, you can supply anything else in your own {\tt UserObject} structure, which you can define as you like.  
I will pass this around for you if you let the {\tt ObjectStr} structure know about it, by setting
$$
    {\tt Object}\rightarrow{\tt UserObject}\ = \hbox{\ address of your object structure.}\ \qquad\qquad\qquad\qquad\quad
$$
It's not usually necessary, though.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 11. User procedures}}}$}
\bigskip

BayeSys works by inserting and deleting atoms, at locations of its choosing. 
You have to tell it the corresponding effect on the likelihood.  
You do that by supplying six procedures, {\tt UserEmpty}, {\tt UserTry1}, {\tt UserTry2}, {\tt UserInsert1}, {\tt UserInsert2} and {\tt UserDelete1}.  
This looks complicated, but it need not be.
Each procedure can build your object from scratch, as in my example code in the bayesapp.c source, 
so all you need to re-program is that central building block in the example source-code.  
Then, when you are satisfied that an application is working, it is usually possible to gain efficiency by coding just the incremental changes that BayeSys needs
(and, of course, checking that your results don't change).
If you are switching the LifeStory2 engine off (by setting ${\tt Method}\,\&\,2 = 0$), 
you don't need to write {\tt UserTry2} or {\tt UserInsert2}, though you still have to satisfy the linker by supplying dummy procedures for them.
 
If you find yourself in an error state in any of your procedures, just return with a negative code as the (signed integer) return value.  
BayeSys should unwind cleanly, undoing its memory assignments, and will exit straight away with your negative value as its own return value.

\medskip
{\tt UserEmpty}:

\noindent {\tt UserEmpty} presents you with no atoms at all (${\tt Natoms}=0$), and asks for the corresponding logLikelihood.  
Usually this is just some dimensional constant that offsets the output ${\tt Evidence} = \log \Pr(D)$, but has no other effect on the calculation.


\medskip
{\tt UserTry1}:

\noindent {\tt UserTry1} asks you for the change in logLikelihood that {\bf would} occur {\bf if} I was to include a new trial atom in the list.  
For convenience, BayeSys will present this atom to you just beyond the current list, at {\tt Cubes[Natoms][.]}.  
Do not change your mock data or similar ancillary information: all I want is the single number $\Delta(\log L)$.  
Normally you will supply a positive return value in {\tt UserTry1}, but if my trial location is unacceptable to you, return with 0, and I will discard the suggestion.
 

\medskip
{\tt UserTry2}:

\noindent {\tt UserTry2} asks you for the change in logLikelihood that {\bf would} occur {\bf if} I was to add either one or two new trial atoms to the list.  
Again, BayeSys will present these atoms to you just beyond the current list, at {\tt Cubes[Natoms][.]} and at {\tt Cubes[Natoms+1][.]}.  
Again, do not change your mock data or such ancillary information.  
Normally, you will supply a positive return value, but if either location is unacceptable, return with 0.


\medskip
{\tt UserInsert1}:

\noindent {\tt UserInsert1} informs you that {\bf I have just inserted} one new trial atom into the current list.  
BayeSys presents this atom to you at the end of the list, at {\tt Cubes[Natoms-1][.]}, the value of {\tt Natoms} having just been incremented.  
You should give me the change in logLikelihood (not the new value), and should also update your mock data or ancillary information now.  
The location will already have been accepted by {\tt UserTry1} or {\tt UserTry2}, so it doesn't matter whether you return with a code of 0 or a positive value.
 

\medskip
{\tt UserInsert2}:

\noindent {\tt UserInsert2} informs you that {\bf I have just inserted} two new trial atoms into the current list.  
BayeSys presents these atoms to you at the end of the list, at {\tt Cubes[Natoms-2][.]}, {\tt Cubes[Natoms-1][.]}, 
the value of {\tt Natoms} having just been increased.  
You should give me the change in logLikelihood (not the new value), and should also update your mock data or ancillary information now.  
The locations will already have been accepted by {\tt UserTry2}, so it doesn't matter whether you return with a code of 0 or a positive value.
 

\medskip
{\tt UserDelete1}:

\noindent {\tt UserDelete1} informs you that {\bf I have just deleted} one atom from the current list.  
BayeSys presents this atom to you just beyond the current list, at {\tt Cubes[Natoms][.]}, the value of {\tt Natoms} having just been decremented.  
You should give me the change in logLikelihood (not the new value), and should also update your mock data or ancillary information.

\medskip
{\tt UserFoot}:

\noindent Additionally, the MassInf extension in BayeSys asks for a different procedure {\tt UserFoot} (see section 14) instead of the above six.  
If you are not using MassInf, you will need to supply a dummy {\tt UserFoot}.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 12. UserMonitor}}}$}
\bigskip

The final procedure you need is {\tt UserMonitor}, which BayeSys calls once per iterate.  
This is where you monitor the calculation, collect your statistical inferences, and calibrate any parameters you need. 
If you have any such nuisance parameters $\phi$ in your likelihood $\Pr(D\mid \theta,\phi)$, you should set them here in {\tt UserMonitor}, 
perhaps by optimising but preferably by sampling from their conditional probability $\Pr(\phi\mid \theta,D)$. 
BayeSys gives you access to the entire system here, including each {\tt ObjectStr}.  
From the {\tt CommonStr}, you may read off the current values of the annealing coolness
$$
    {\tt Common}\rightarrow{\tt cool} = \lambda \qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad
$$
and the evidence
$$
    {\tt Common}\rightarrow{\tt Evidence} = \log \int {\rm Likelihood}(\theta)\, d{\rm Prior}(\theta)\ \qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
where (temporarily) I define
$$
    {\rm Likelihood}(\theta) = \Pr(D\mid \theta)^{\rm cool}\ ,
$$
and of the diagnostic
$$
    {\tt Common}\rightarrow{\tt Information}
       = \int \log\Big({{\rm Posterior}(\theta)\over{\rm Prior}(\theta)}\Big) \,d{\rm Posterior}(\theta)\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
where, again using the temporary definition,
$$
    {\rm Posterior}(\theta) \propto {\rm Likelihood}(\theta)\, {\rm Prior}(\theta),\quad \hbox{normalised to 1.}
$$
Being minus the entropy of the posterior over the prior, 
the {\tt Information} is the logarithm of the volumetric factor by which the prior has been compressed to become the current posterior.  
Hence the moniker.

{\tt UserMonitor} is also where you limit the annealing.  
BayeSys brings in the data gradually, by using fractional powers $\Pr(D\mid \theta)^{\rm cool}$ of the likelihood instead of the likelihood itself, 
and gently raising {\tt cool} from zero.  
At the beginning, ${\tt cool} = 0$, and the program merely explores the prior.  
By the time {\tt cool} has reached~1, the program should be exploring the posterior.  
Left to itself, BayeSys will crank {\tt cool} up to arbitrarily high values, by which time it will be acting as a maximum-likelihood search.  
That may be what you want --- BayeSys can be a usefully powerful maximiser --- but for probabilistic calculation you need to stop at ${\tt cool} = 1$.  
Do this with a line of code equivalent to
$$
    {\tt if(\ Common}\rightarrow{\tt cool\ >\ 1.0\ )\ \ \ Common}\rightarrow{\tt cool\ =\ 1.0;} \qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
in {\tt UserMonitor}.  
This prevents BayeSys from annealing further, 
and it will continue to explore the posterior until you tell it to stop by returning from {\tt UserMonitor} with a positive code instead of the usual zero.

I suggest that you allow the system to explore the posterior for about as many iterates as it took to anneal.  
The diagnostic ${\tt Common}\rightarrow{\tt Nsystem}$ records the serial number of the current iteration.  
There is little point in using less than 50\% of your total CPU to anneal, because you can save at most a factor of two, and the annealing may become less robust.  
Equally, there is seldom much point in using more than 50\% of your CPU to anneal, if exploration is your aim.

To help your display, I append to each 
atom my estimate of its uncertainty in position (section 7), as
$$
   {\tt Cube[Ndim+Valency]} = \log(\hbox{fraction of hypercube volume plausibly covered by atom}).\qquad
$$
This lets you draw an atom as a bell shape of assigned width, instead of as a delta function.  
It lets you produce visually smooth presentations, without damaging the resolution where it is known to be good.


\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 13. MassInf prior parameters}}}$}
\bigskip

An important class of applications has data that are linear in one of the attributes of an atom, called a ``flux'' in this document.  
In fact, an atom may have several such fluxes, as in multi-colour imagery.  
I call the number of fluxes the ``{\tt Valency}'' of an atom. 
Fluxes are special attributes, additional to the ordinary {\tt Ndim} spatial coordinates, and I append them to the {\tt Cube} values.  
Thus
$$
\matrix{
    {\tt Cube[0]}\hfill& \hbox{ to\ \ }{\tt Cube[Ndim-1]}          \hfill & \hbox{ represent coordinates,}         \hfill\cr
    {\tt Cube[Ndim]}   & \hbox{ to\ \ }{\tt Cube[Ndim+Valency-1]}  \hfill & \hbox{ represent fluxes,}              \hfill\cr
    {\tt Cube[Ndim+Valency]}                                  \span\hfill & \hbox{ represents log(display width).} \quad\cr
       }
$$
If $\,{\tt Valency} > 0$, you need to choose an appropriate prior for the fluxes $z$, 
which involves three more parameters {\tt MassInf}, {\tt ProbON} and {\tt FluxUnit0}.

\bigskip
{\tt MassInf} (see section 6.1):  

\noindent {\tt MassInf} defines the type of prior to be used for the fluxes $z$.

\centerline{\vbox{\vskip 4pt
\offinterlineskip
      \halign{&\vrule#&   \strut\quad\hfil#\hfil\quad        &\vrule#&\strut\quad#\quad\hfil&\vrule#& \strut\quad#\hfil\quad                                  &\vrule# \cr
   \noalign{\hrule}                                                                                                                                                    \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
                      &${\tt Common}\rightarrow{\tt MassInf}$&      &  \ \ \ \    Name     &      & \qquad\qquad\qquad  Description                           &        \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
   \noalign{\hrule}                                                                                                                                                    \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
                      &            0                         &      &   ``monkeys''        &      & ${\rm Prior}(z) = \delta(z-q)$, each $z$ equals $q$       &        \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
                      &            1                         &      &   positive           &      & ${\rm Prior}(z) \propto \exp(-z/q)$ for $z > 0$           &        \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
                      &            2                         &      &   positive/negative  &      & ${\rm Prior}(z) \propto \exp(-|z|/q)$ for $z$ +ve or $-$ve&        \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
                      &            3                         &      &   Gaussian           &      & ${\rm Prior}(z) \propto \exp(-z^2 / 2 q^2)$ for all $z$   &        \cr
            height2pt &           \omit                      &      &         \omit        &      &         \omit                                             &        \cr
   \noalign{\hrule}                                                                                                                                                    \cr
          }
       } }
\noindent ${\tt MassInf}=0$ directly encodes the original ``monkey'' model which inspired maximum entropy data analysis.  
${\tt MassInf}=1$ extends this by allowing flux to vary so that signals need not be digitised to multiples of~$q$.  
${\tt MassInf}=2$ generalises by allowing flux to be negative, which is sometimes appropriate.  
The Gaussian form ${\tt MassInf}=3$ also allows flux to take either sign, but is less tolerant of outliers.

If the data are Poisson counts, instead of the standard case with Gaussian uncertainty, add 100 to {\tt MassInf}.
Thus ${\tt MassInf}=100$ is the ``monkey'' model, and ${\tt MassInf}=101$ is the ``positive'' prior, with Poisson data.
Values of 102 or 103 would be inapplicable, and are disallowed.
\bigskip
{\tt ProbON}:  

\noindent You also need to set
$$
    {\tt Common}\rightarrow{\tt ProbON} = \Pr(\hbox{individual flux is non-zero}).\ \ \qquad\qquad\qquad\qquad\qquad\qquad
$$
Usually this is 1, but you can set it less if you need to allow individual fluxes to switch off completely.

\bigskip
{\tt FluxUnit0} (see section 6.3):  

\noindent Fluxes being dimensional quantities, you also ought to supply me with an appropriate value for $q_0$:
$$
    {\tt Common}\rightarrow{\tt FluxUnit0} = \hbox{your estimate of $q_0$.}\ \qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad
$$
Ideally, you know what $q_0$ should be, and give it to me as a positive value.  
I will then copy that value to each object as its ${\tt Object}\rightarrow{\tt FluxUnit}$.  
If, instead, you only want to give me a general guide, supply it as a negative value of ${\tt Common}\rightarrow{\tt FluxUnit0}$.  
I will then allow each object's individual flux unit to vary around that value's modulus (with an  $x e^{-x}$  prior, as it happens).  
If you really don't know at all what the flux unit should be, then just give me ${\tt Common}\rightarrow{\tt FluxUnit0}$ = 0, 
and I will peek at the data to assign my own estimate (with a minus sign to allow the objects' flux units to vary).
$$
\matrix{
    {\tt Common}\rightarrow{\tt FluxUnit0} = +{\rm ve}\hfill&\Longrightarrow\quad  {\tt Object}\rightarrow{\tt FluxUnit} = \hbox{same +ve value},             \hfill\cr
    {\tt Common}\rightarrow{\tt FluxUnit0} = -{\rm ve}\hfill&\Longrightarrow\quad  {\tt Object}\rightarrow{\tt FluxUnit}\ \hbox{ guided by this (+ve) value}, \hfill\cr
    {\tt Common}\rightarrow{\tt FluxUnit0} = 0        \hfill&\Longrightarrow\quad  \hbox{the data give me a ($-$ve) value which I will impose.}               \hfill\cr
       }
$$
As with {\tt Alpha}, though, these details may not matter too much in practice.  
Extensive data can soon come to dominate this sort of prior assignment.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 14. MassInf likelihood settings}}}$}
\bigskip

The data, being linear in the fluxes, are allowed to have either Gaussian or Poisson errors, as described in section 6.2.  
As shown in section 6.4, this means that we can integrate over and sample from the posterior for an atom's fluxes directly, without having to explore them.  
When BayeSys selects one or two atoms for attention, their fluxes can thus be ``marginalised'' away, so that only their positional coordinates need be considered.  
This makes exploration much more efficient, because the dimensionality of the problem is reduced.  
Then, when new positions are accepted, fluxes can be assigned by direct sampling.

\bigskip
{\tt UserFoot}:
\smallskip
\noindent Usually, the only data we have when using MassInf are those that are linear in the fluxes, 
and in that case we don't need the six BayeSys procedures {\tt UserEmpty}, {\tt UserTry1}, 
{\tt UserTry2}, {\tt UserInsert1}, {\tt UserInsert2} and {\tt UserDelete1} (though dummy versions are still required, returning positive codes for acceptable locations).  
All we need is a {\bf single} procedure {\tt UserFoot} that gives the footprint, or Green's function, 
of unit flux(es) at my selected location {\tt Cubes[.][0...Ndim-1]}.  
This footprint is supplied as
$$
\matrix{
    {\tt nbits[.]}\hfill &= \hbox{ length of footprint, for each Valency,}      \hfill\cr
    {\tt ibits[...]}     &= \hbox{ location of footprint fragments, serially,}  \hfill\cr
    {\tt zbits[...]}     &= \hbox{ flux of footprint fragments, serially.}      \hfill\cr
       }
$$
For example, with {\tt Valency} = 3, the following assignment
$$
\matrix{
    {\tt int\ \ \ \ nbits[3]} \hfill &= &\{&  2,    &    3,        &  3       &\};&\qquad //\ [{\tt Valency}]     \hfill\cr
    {\tt int\ \ \ \ ibits[2+3+3]}    &= &\{&15,16,  &4,\ 5,\ 6,    &9,11,12   &\};&\qquad //\ [{\tt SUM(nbits)}]  \hfill\cr
    {\tt double\    zbits[2+3+3]}    &= &\{&\ 5,\ 6,&1,\ 2,\ 3,    & 4,\ 3,\ 2&\};&\qquad //\ [{\tt SUM(nbits)}]  \hfill\cr
       }
$$
has the first of three fluxes interacting with ${\tt nbits[0]} = 2$ data elements at positions 15,16 with strengths 5,6 respectively.  
The second flux interacts with ${\tt nbits[1]} = 3$ data elements at positions 4,5,6 with strengths 1,2,3, 
and the third interacts with ${\tt nbits[2]} = 3$ data elements at positions 9,11,12 with strengths 4,3,2. 
An atom with fluxes 1,3,2 would then contribute

$\qquad   1\times(5,6) + 3\times(1,2,3) + 2\times(4,3,2) = \{0, 0, 0, 0, 3, 6, 9, 0, 0, 8, 0, 6, 4, 0, 0, 5, 6, 0, 0,\ldots\}  $

\noindent to the mock data.  

It is a restriction of the algorithm that the different valencies of an atom do not overlap. 
If they did, I couldn't do the integrals.  
And, if you do try to give an overlapped footprint, BayeSys will abort with one of its negative return values.
BayeSys will abort even more directly if you try to supply too many fragments (${\tt SUM(nbits)} > {\tt Ndata}$),
 or if they are out of range ({\tt ibits[.]} outside {\tt [0,Ndata-1]}).
Slightly more restrictive conditions apply if you use the LifeStory2 engine, which processes two atoms at once.  
In this case, a valency from one atom must not overlap more than one valency of another.
Again, BayeSys will protect itself if you disobey the restrictions.
If your application requires such overlaps, you will have to transform the fluxes to ordinary {\tt Cube} or {\tt Coord} coordinates, 
and use BayeSys directly without the MassInf extension. 

\vfill\eject
\noindent {14.1. \it Gaussian data}
\smallskip

With Gaussian data, Massive Inference requires you to supply $D \pm \sigma$ through
$$
\matrix{  
    {\tt Common}\rightarrow{\tt Ndata}    \hfill &= {\tt Ndata};        \hfill  &//{\tt \ \# data}              \hfill  &              \cr
    {\tt Common}\rightarrow{\tt Data}     \hfill &= {\tt Data};         \hfill  &//{\tt \ data}                 \hfill  &{\tt [Ndata]} \hfill\cr
    {\tt Common}\rightarrow{\tt Acc}      \hfill &= {\tt Acc};          \hfill  &//{\tt \ accuracies = 1/sigma} \hfill  &{\tt [Ndata]}\qquad\qquad\ \  \cr
     }
$$
as suggested in section 10 for general BayeSys use.

There is an occasional variant of ordinary Gaussian errors, in which you know the relative errors, but not their absolute scale.  
In previous versions of MassInf, I used to handle this for you, separately for each individual object.  
However, you can easily do it for yourself.  
From {\tt UserMonitor}, re-scale the {\tt Acc} vector to make the $\chi^2$ misfit equal to the number of measured data (having ${\tt Acc} > 0$).  
You can do this as an ensemble-wide average, which is better than doing it individually as I used to do.  
Rescaling {\tt Acc} does not affect the algorithm's performance.

\bigskip
\noindent {14.2. \it Poisson data}
\smallskip

With Poisson data, Massive Inference requires {\tt Acc} to be set to the background $B$, 
which must be strictly positive whenever the pre-subtracted data $D+B$ are non-zero. 
If you can estimate some realistic positive level, that's fine.  
If not, assign it as some small fraction (a few \%) of your {\tt Data}, and use it anyway: you can use non-integer values.  
This ansatz amounts to pessimistically under-estimating the precision of your data by $1\over2$ of the few \% that you added as artificial background.

MassInf computations for Poisson data scale quadratically with the number of counts covered by a footprint, or even cubically if the LifeStory2 engine is used.  
This means that the program is only practical for counts of up to 100 or so per footprint.  
For larger counts, switch off LifeStory2, or approximate your Poisson errors with Gaussian ones and use the Gaussian-error option instead.

\bigskip
\noindent {14.3. \it MassInf with BayeSys}
\smallskip

You can use both BayeSys and MassInf together, if you have a problem for which the likelihood factorises as
$$
    \Pr(D\mid {\rm atoms}\lbrace x,z\rbrace) = L_{\rm Bayes}(x)\,L_{\rm MassInf}(z\mid x)
$$
where $L_{\rm MassInf}$ has Gaussian or Poisson form over the fluxes $z$.
In these cases, you need to supply {\tt UserEmpty}, {\tt UserTry1}, {\tt UserTry2}, {\tt UserInsert1}, {\tt UserInsert2} and {\tt UserDelete1} as usual
to encode the function $L_{\rm Bayes}$ of coordinates, along with {\tt UserFoot} to give the footprint of an atom so that $L_{\rm MassInf}$ can be used alongside it.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 15. Using BayeSys}}}$}
\bigskip

After setting the input parameters and data pointers in the {\tt CommonStr} and {\tt ObjectStr} structures, the command to run the BayeSys program is simply
$$
    {\tt return\_value\ =\ BayeSys3(\ Common,\ Objects\ );}\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\qquad\quad
$$
The suffix ``3'' in {\tt BayeSys3} marks the $3^{\rm rd}$ principal version of the BayeSys program, in a series started with {\tt BayeSys1} in 1999,
itself the descendant of various {\tt MemSys} maximum entropy and {\tt MassInf} massive inference programs.
If {\tt Valency} was set to 0, you will have specified the likelihood function through the 
{\tt UserEmpty}, {\tt UserTry1}, {\tt UserTry2}, {\tt UserInsert1}, {\tt UserInsert2}, {\tt UserDelete1} procedures, 
and also supplied a dummy {\tt UserFoot} (which won't be called).
Otherwise, with positive {\tt Valency}, you will have specified an atom's footprint through the {\tt UserFoot} procedure, 
and also supplied {\tt UserEmpty}, {\tt UserTry1}, {\tt UserTry2}, {\tt UserInsert1}, {\tt UserInsert2}, {\tt UserDelete1}
(which for pure massive inference will all have been dummy procedures with positive return values).
$$
\matrix{
     & &
       {\vbox{\offinterlineskip
             \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
             height2pt         &        \omit               &     \cr
                               &         Start              &     \cr
                               &Initialise parameters $\phi$&     \cr
             height2pt         &        \omit               &     \cr
                     \noalign{\hrule}
                    }
             \vskip 6pt
             }
       }
     & &
        \cr
     & &
       \Biggl\downarrow & & \cr
       {\vbox{\offinterlineskip
            \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
            height2pt         &          \omit              &         \cr
                              &     Response to new         &         \cr
                              &     positions or            &         \cr
                              &     numbers of atoms        &         \cr
            height2pt         &          \omit              &         \cr
                     \noalign{\hrule}
                        \omit  &  {\tt User procedures}     &  \omit  \cr
                   }
          }
       }
       &
       {\vbox{\offinterlineskip
            \halign{ \strut\ \hfil#\hfil\  \cr
                     \leftarrowfill        \cr
                     \phantom{xxxxxx}      \cr
                     \rightarrowfill       \cr
                   }
             \vskip 8pt
             }
       }
       &
       {\vbox{\offinterlineskip
             \halign{  \vrule width2pt# & \strut\ \ \hfil#\hfil\ & \vrule width2pt# \cr
                     \noalign{\hrule height2pt}
             height6pt         &          \omit            &         \cr
                               &     Atoms of new object   &         \cr
                               & $\theta$, given $\phi$    &         \cr
             height6pt         &          \omit            &         \cr
                     \noalign{\hrule height2pt}
                        \omit  &  {\tt BayeSys/MassInf}    &  \omit  \cr
                   }
             }
       }
       &
       {\vbox{\offinterlineskip
            \halign{ \strut\ \hfil#\hfil\  \cr
                     \rightarrowfill       \cr
                     \phantom{xxxxxx}      \cr
                     \leftarrowfill        \cr
                   }
             \vskip 8pt
             }
       }
       &
       {\vbox{\offinterlineskip
            \halign{  \vrule# & \strut\ \ \hfil#\hfil\  & \vrule# \cr
                     \noalign{\hrule}
            height2pt         &       \omit             &         \cr
                              &  Display object $\theta$&         \cr
                              &  Collect statistics     &         \cr
                              & (New parameters $\phi$) &         \cr
            height2pt         &       \omit             &         \cr
                     \noalign{\hrule}
                        \omit  &  {\tt UserMonitor}     &  \omit  \cr
                   }
          }
       }
         \cr
     & & \Biggl\downarrow & & \cr
     & &
       {\vbox{\offinterlineskip
             \halign{  \vrule# & \strut\ \ \hfil#\hfil\ & \vrule# \cr
                     \noalign{\hrule}
             height3pt         &        \omit           &         \cr
                               &        Finish          &         \cr
             height2pt         &        \omit           &         \cr
                     \noalign{\hrule}
                    }
             \vskip 6pt
             }
       }
     & &
        \cr
     }
$$

Once each iterate, {\tt BayeSys3} calls your {\tt UserMonitor} procedure, where you monitor the calculation, 
collect your statistical inferences, and calibrate any parameters you need.
{\tt UserMonitor} should return with a code of 0 until you wish to finish a run, when it should return with a positive code.
{\tt BayeSys3} will then finish, and return with that positive code as its own return value.
Before returning, it will de-allocate all its memory allocations, including the {\tt Cubes} arrays which specified its sample objects.
This means that you must use {\tt UserMonitor} to extract all the information you want about the objects before instructing {\tt BayeSys3} to finish.

As it proceeds, {\tt BayeSys3} accumulates progress diagnostics {\tt CPU} and {\tt Success} in its {\tt CommonStr} structure.
{\tt CPU} gives the effective number of single-atom calls to the Try/Insert/Delete procedures, a measure of the overall computational load.
{\tt Success} gives the number of these calls which resulted in changing an object usefully, the definition of which is admittedly arbitrary.
The ratio {\tt Success}/{\tt CPU} is a measure of exploration efficiency, necessarily less than 1 because of the nature of slice sampling.
Ratios in the range 0.01 to 0.1 seem typical, with ratios less than 0.01 being disappointing. 

If you detect an error state in one of your procedures, return from it with a negative return value.
{\tt BayeSys3} will unwind cleanly, and return with that value as its own return value.
It will do the same, with its own negative return value from the list in bayesys3.h, if it detects an internal error itself.

\vfill\eject
\noindent{$\underline{\hbox{\bf{Section 16. Program files}}}$}
\bigskip

The kernel procedures of BayeSys and MassInf are supplied in the bayesys3.c library source file, with its header bayesys3.h.
BayeSys uses a variety of random-number and related procedures, which are provided in the random.c library file with its header random.h.  
The other source that needs to be linked is hilbert.c with header hilbert.h, 
which contains efficient procedures for moving back and forth between multi-dimensional coordinates 
and the one-dimensional extended-precision Hilbert line over which the BayeSys exploration engines operate.  
The hilbert.c library also contains linked-list procedures for ordering atoms along this Hilbert curve.
You may find some of the random or hilbert procedures independently useful.

Finally, BayeSys allows parallel processing of its exploration engines, 
insofar as this mode of operation does not interfere with the detailed balance of correct probabilistic analysis.  
To use this, edit the line

\indent\qquad    {\tt \#define\ PARALLEL\ 0}

\noindent of bayesys3.c to assign whatever number of parallel (extra) processors should be available, 
re-test the application in what will be simulation mode with or without the {\tt FLOWCHECK} printed diagnostics, then activate the line

\indent\qquad    {\tt \#define\ THREADS}

\noindent and try to merge the code into your parallel environment. No guarantees: parallelism is non-standard.

I supply three test programs:

\indent\qquad\quad    bayestoy.c is a BayeSys application which happens to have Gaussian data,

\indent\qquad\quad    masstoy.c  is a MassInf application with the same Gaussian data,

\indent\qquad\quad    poisstoy.c is a MassInf application with Poisson data.

\noindent I also give the log file of each run (on my own hardware).  
The tests are constructed so that bayestoy and masstoy should produce identical long-term statistics 
(though you will have to reduce {\tt Rate} to check that all the details 
really are looking identical).  
Although {\tt Data} and {\tt Acc} have the same numerical values in all tests, the interpretations differ.  
With Gaussian data, the signals are (${\tt Data} \pm 1/{\tt Acc}$), whereas in the Poisson case {\tt Acc} represents a background, 
so that $({\tt Data} + {\tt Acc})$ counts are to be identified with $({\rm signal} + {\rm background})$, with corresponding square-root-type errors being implied.
The test programs should be linked with bayesys3.c, random.c and hilbert.c, and with their supporting {\tt UserXXX} procedures in bayesapp.c (for bayestoy) or massapp.c (for masstoy and poisstoy).
$$
\matrix{
    \hbox{bayestoy}    &+ &\hbox{bayesys3+random+hilbert} &+ &\hbox{bayesapp} &\Longrightarrow &\hbox{\ bayestoy  executable}  \hfill\cr
    \hbox{masstoy}     &+ &\hbox{bayesys3+random+hilbert} &+ &\hbox{massapp}  &\Longrightarrow &\hbox{\ masstoy\  executable}  \hfill\cr
    \hbox{poisstoy}    &+ &\hbox{bayesys3+random+hilbert} &+ &\hbox{massapp}  &\Longrightarrow &\hbox{\ poisstoy  executable}  \hfill\cr
       }
$$

\noindent Files supplied:
$$
\matrix{
   \hbox{bayestoy.c \ masstoy.c \ poisstoy.c \ userstr.h}                      \hfill\qquad\qquad        *\hbox{USER}* \cr
   \hbox{bayesys3.c \ bayesys3.h \ random.c \ random.h \ hilbert.c \ hilbert.h}\hfill\qquad\qquad   *\hbox{LIBRARIES}* \cr
   \hbox{bayesapp.c \ massapp.c}                                               \hfill\qquad\qquad        *\hbox{USER}* \cr
   \hbox{bayestoy.log \ masstoy.log \ poisstoy.log}                            \hfill\qquad\qquad     *\hbox{RESULTS}* \cr
   \hbox{license.txt}  \hfill\qquad\qquad  *\hbox{GNU Lesser General Public License under which the libraries are distributed}* \cr
       }
$$

\noindent These toy programs are designed to be simple and illustrative rather than efficient, to encourage you to write your own applications.
\bigskip
Best wishes!

\vfill\eject
\centerline{\bf REFERENCES}
\bigskip

\noindent Abend, K., Hartley, T.J. and Kanal, L.N. (1965) ``Classification of binary random patterns'',

 IEEE Trans. Information Theory {\bf IT--11}, 538--544.

\noindent Bially, T. (1969) ``Space-filling curves, their generation and their application to bandwidth reduction'',

 IEEE Trans. Information Theory, {\bf IT--15}, 658--664.

\noindent Brooks, S.P. (1998) ``Markov chain Monte Carlo and its applications'',

 The Statistician, {\bf 47}, 69--100.

\noindent Butz, A.R. (1969) ``Convergence with Hilbert's space-filling curve'',

 J. Comput. Sys. Sci. {\bf 3}, 128--146.

\noindent Butz, A.R. (1971) ``Alternative algorithm for Hilbert's space-filling curve'',

 IEEE Trans. Computers, {\bf 20}, 424--426.

\noindent Cox, R.T. (1946) ``Probability, Frequency, and Reasonable Expectation'',

 Amer. J. Phys., {\bf 14}, 1--13.

\noindent Dawid, A.P., Stone, M. and Zidek, J.V. (1973) ``Marginalization paradoxes in Bayesian and

 structural inference'', J. Roy. Statist. Soc. B{\bf 35}, 189--233.

\noindent Duane, S., Kennedy, A.D., Pendleton, B.J. and Roweth, D. (1987) ``Hybrid Monte Carlo'',

 Phys. Lett. B{\bf 195}, 216--222.

\noindent Feller, W. (1971) ``An introduction to probability theory and its applications'',

 vol 2, Wiley, New York.

\noindent Ferguson, T.S. (1973) ``A Bayesian analysis of some nonparametric problems'',

 Annals of Statistics, {\bf 1}, 209--230.

\noindent Gelman, A. and Meng, X.-L. (1998) ``Simulating normalizing constants: from importance sampling to bridge sampling

 to path sampling'', Statistical Science, {\bf 13}, 163--185.

\noindent Geman, S. and Geman, D. (1984) ``Stochastic relaxation, Gibbs distribution, and the Bayesian restoration

 of images'', IEEE Trans. Pattn. Anal. Mach. Intell. {\bf 6}, 721--741.

\noindent Goggans, P.M. and Chi, Y. (2004) ``Using thermodynamic integration to calculate the posterior probability

 in Bayesian model selection problems'', {\it Bayesian inference and maximum entropy methods in science

 and engineering}, 23rd workshop, Jackson, Wyoming, ed. G.J. Erickson.

\noindent Green, P.J. (1994) ``Discussion on representations of knowledge in complex systems

 (by U. Grenander and M.I. Miller)'', J. Roy. Statist. Soc. B{\bf 56}, 589--590.

\noindent Grenander, U. and Miller, M.I. (1994) ``Representations of knowledge in complex systems (with discussion)'', 

J. Roy. Statist. Soc. B{\bf 56}, 549--603.

\noindent Gull, S.F. and Daniell, G.J. (1978) ``Image reconstruction from incomplete and noisy data'',

 Nature {\bf 272}, 686--690.

\noindent Gull, S.F. and Skilling, J. (1984) ``The Maximum Entropy Method'' 

 in {\it Indirect Imaging}, J.A. Roberts (ed.), Cambridge Univ. Press, 267--279.

\noindent Hastings, W.K. (1970) ``Monte Carlo sampling methods using Markov chains and their applications'',

 Biometrika {\bf 57}, 97--109.

\noindent Hilbert, D. (1891) ``\"Uber die stetige Abbildung einer Linie auf ein Fl\"achenst\"uck'',

 Mathematische Annalen {\bf 38}, 459--460.

\noindent Jaynes, E.T. (1986) ``Monkeys, Kangaroos, and $N$'' in {\it Maximum Entropy and Bayesian Methods in

 Applied Statistics}, J.H. Justice (ed.), Cambridge Univ. Press, 26--58.

\noindent Jaynes, E.T. (2003) ``{\it Probability Theory, the Logic of Science}'', ed. G.L. Bretthorst,

 Cambridge Univ. Press.

\noindent Kirkpatrick, S., Gelatt, C.D. and Vecchi, M.P. (1983) ``Optimization by simulated annealing'',

 Science, {\bf 220}, 671--680.

\noindent Lawder, A.K. (2000)  ``Calculation of mappings between one and n-dimensional values using the

 Hilbert space-filling curve'',  Research report JL1/00,

 School of Computer Science and Information Systems, Birkbeck College, Univ. London.

\noindent MacKay, D.J.C. (2003) ``{\it Information Theory, Inference, and Learning Algorithms}'',

 Cambridge Univ. Press.
\vfill\eject
\noindent Marinari, E. and Parisi, G. (1992) ``Simulated tempering: a new Monte Carlo scheme'',

 Europhys. Lett. {\bf 19}, 451--458.

\noindent Metropolis, N., Rosenbluth, A.W., Rosenbluth, M.N., Teller, A.H. and Teller, E. (1953)

 ``Equation of state by fast computing machines'', J. Chemical Physics {\bf 21}, 1087--1092.

\noindent Neal, R.M. (1993) ``An improved acceptance procedure for the hybrid Monte Carlo algorithm'',

 J. Computational Physics {\bf 111}, 194--203.

\noindent Neal, R.M. (2001) ``Annealed importance sampling'',

 Statistics and Computing, {\bf 11}, 125--139.

\noindent Neal, R.M. (2003) ``Slice sampling'',

 Annals of Statistics {\bf 31}, 705--767.

\noindent Otten, R.H.J.M. and van Ginneken, L.P.P.P. (1984) ``Floorplan design using simulated annealing'',

 IEEE International Conference on Computer-Aided Design {\bf ICCAD-84}.

\noindent Phillips, D.B. and Smith, A.F.M. (1996) ``Bayesian model comparison via jump diffusions''

 in {\it Markov Chain Monte Carlo in Practice},

 eds. W.R. Gilks, S. Richardson and D.J. Spiegelhalter, Chapman and Hall, London.

\noindent Propp, J.G. and Wilson, D.B. (1996) ``Exact sampling with coupled Markov chains and applications to

 statistical mechanics'', Rand. Struct. Alg. {\bf 9}, 223--252.

\noindent Richardson, S. and Green, P.J. (1997) ``On Bayesian analysis of mixtures with an unknown number

 of components'' (with discussion). J. Roy. Statist. Soc. B{\bf 59}, 731--792.

\noindent Roberts, G.O. and Smith, A.F.M. (1994) ``Simple conditions for the convergence of the Gibbs sampler and

 Metropolis-Hastings algorithms'', Stochastic Processes and their Applications {\bf 49}, 207--216.

\noindent Sagan, H. (1994) ``Space-Filling Curves'',

 Springer-Verlag, New York.

\noindent Shore, J.E. and Johnson, R.W. (1980) ``Axiomatic Derivation of Maximum Entropy and the Principle of

 Minimum Cross-Entropy''. IEEE Trans. {\bf IT--26} 26--37; and (1983) {\bf IT--29} 942--943.

\noindent Sibisi, S. and Skilling, J. (1997) ``Prior distributions on measure space'',

 J. Roy. Statist. Soc. B{\bf 59}, 217--235.

\noindent Sivia, D. (1996) ``Data analysis: a Bayesian tutorial'',

 Oxford Univ. Press.

\noindent Skilling, J. (2004a) ``Programming the Hilbert Curve'', {\it Bayesian inference and maximum entropy

 methods in science and engineering}, 23rd workshop, Jackson, Wyoming, ed. G.J. Erickson.

\noindent Skilling, J. (2004b) ``Using the Hilbert Curve'', {\it Bayesian inference and maximum entropy

 methods in science and engineering}, 23rd workshop, Jackson, Wyoming, ed. G.J. Erickson.

\noindent Skilling, J. and MacKay, D.J.C. (2003) ``Discussion on slice sampling (by R.M. Neal)'',

 Annals of Statistics, {\bf 31}, 753--754.

\noindent Smith, A.F.M. and Roberts, G.O. (1993) ``Bayesian computation via the Gibbs sampler and related

 Markov chain Monte Carlo methods'', J. Roy. Statist. Soc. B{\bf 55}, 3--23.

\noindent Song, Z. and Roussopoulos, N. (2002) ``Using Hilbert curve in image storing and retrieving'',

 Information Systems, {\bf 27(8)}, 523--536.

\noindent Stevens, R.J., Lehar, A. and Preston, F.H. (1983) ``Manipulation and presentation of multidimensional

 image data using the Peano scan'', IEEE Trans. Pattern Analysis and Machine Intelligence, {\bf 5}, 520--526.

\noindent Swendsen, R.H. and Wang, J.-S. (1986) ``Replica Monte Carlo simulation of spin glasses'',

 Phys. Rev. Lett. {\bf 57}, 2607--2609.

\noindent Tikhonov, A.N. and Arsenin, V.Y. (1977) ``{\it Solutions of Ill-posed Problems}'',

  London, Wiley.

\noindent Titterington, D.M., Smith, A.F.M. and Makov, U.E. (1985)

 ``{\it Statistical analysis of finite mixture distributions}'', Wiley, Chichester.

\vfill\eject

\centerline{\bf INDEX}
\bigskip
\halign{                                      #             \quad\qquad & \qquad                                       #                          \cr
{\tt Acc}                            \quad\hfill 42,48,50               & {\tt Mock}                           \quad\hfill 43                     \cr
{\tt Alpha}                          \quad\hfill 22,40                  & {\tt Natoms}                         \quad\hfill 43,44                  \cr
{\tt BayeShape}                      \quad\hfill 23,40                  & {\tt nbits}                          \quad\hfill 47                     \cr
{\tt BayeSys3}                       \quad\hfill 49                     & {\tt Ndata}                          \quad\hfill 42,48                  \cr
{\tt Common}                         \quad\hfill 42,45,46,48,49         & {\tt Ndim}                           \quad\hfill 40,46,47               \cr
{\tt CommonStr}                      \quad\hfill 42,45,49               & {\tt Nsystem}                        \quad\hfill 45                     \cr
{\tt cool}                           \quad\hfill 45                     & {\tt Object}                         \quad\hfill 43,46                  \cr
{\tt Coord}                          \quad\hfill 23,40                  & {\tt Objects}                        \quad\hfill 43,49                  \cr
{\tt CPU}                            \quad\hfill 49                     & {\tt ObjectStr}                      \quad\hfill 43,45                  \cr
{\tt Cube}                           \quad\hfill 23,40,43,46,47         & {\tt ProbON}                         \quad\hfill 46                     \cr
{\tt Cubes}                          \quad\hfill 40,43,44,49            & {\tt Rate}                           \quad\hfill 17,41,50               \cr
{\tt Data}                           \quad\hfill 42,48,50               & {\tt Success}                        \quad\hfill 49                     \cr
{\tt ENSEMBLE}                       \quad\hfill 26,41,43               & {\tt UserCommon}                     \quad\hfill 42                     \cr
{\tt Evidence}                       \quad\hfill 44,45                  & {\tt UserDelete1}                    \quad\hfill 44,47,48,49            \cr
{\tt FluxUnit}                       \quad\hfill 46                     & {\tt UserEmpty}                      \quad\hfill 44,47,48,49            \cr
{\tt FluxUnit0}                      \quad\hfill 46                     & {\tt UserFoot}                       \quad\hfill 44,47,48,49            \cr
{\tt ibits}                          \quad\hfill 47                     & {\tt UserInsert1}                    \quad\hfill 44,47,48,49            \cr
{\tt Information}                    \quad\hfill 45                     & {\tt UserInsert2}                    \quad\hfill 44,47,48,49            \cr
{\tt Iseed}                          \quad\hfill 41                     & {\tt UserMonitor}                    \quad\hfill 21,45,47,48            \cr
{\tt Lhood}                          \quad\hfill 43                     & {\tt UserObject}                     \quad\hfill 43                     \cr
{\tt MassInf}                        \quad\hfill 46                     & {\tt UserTry1}                       \quad\hfill 44,47,48,49            \cr
{\tt MaxAtoms}                       \quad\hfill 22,40                  & {\tt UserTry2}                       \quad\hfill 44,47,48,49            \cr
{\tt Method}                         \quad\hfill 26,41,44               & {\tt Valency}                        \quad\hfill 35,38,40,46,47,49      \cr
{\tt MinAtoms}                       \quad\hfill 22,40                  & {\tt zbits}                          \quad\hfill 47                     \cr
                                                                        &                                                                         \cr
acceptance probability ($A$)         \quad\hfill 12                     & birth of atom                        \quad\hfill 9,10,27,29             \cr
accuracy ($1/\sigma$)                \quad\hfill 42                     & Boltzmann's constant ($k$)           \quad\hfill 15                     \cr
additive quantity (see flux)         \quad\hfill 3                      & British Isles                        \quad\hfill 4,6                    \cr
aesthetics                           \quad\hfill 39                     & canonical ensemble                   \quad\hfill 19                     \cr
affine transformation                \quad\hfill 31                     & cell                                 \quad\hfill 6,36,37                \cr
analogue scheme                      \quad\hfill 13,30                  & census unit                          \quad\hfill 6                      \cr
anisotropy of transformation         \quad\hfill 23,25                  & Chameleon1                           \quad\hfill 26,33                  \cr
annealed importance sampling         \quad\hfill 17                     & Chameleon2                           \quad\hfill 26,34                  \cr
annealing                            \quad\hfill 15,17,18,41,45         & chemical potential ($\mu$)           \quad\hfill 19,20                  \cr
annealing paths                      \quad\hfill 20                     & chisquared ($\chi^2$)                \quad\hfill 37,48                  \cr
annealing schedule                   \quad\hfill 16                     & circular symmetry                    \quad\hfill 36                     \cr
aperiodic                            \quad\hfill 9                      & clock time                           \quad\hfill 41                     \cr
apparent cooling                     \quad\hfill 18                     & complexity, of object                \quad\hfill 23                     \cr
artificial time                      \quad\hfill 10,27,34               & composite state                      \quad\hfill 27                     \cr
atomic prior                         \quad\hfill 6                      & computation time                     \quad\hfill 26,49                  \cr
atoms                            \quad\hfill 6,9,10,22,23,26--34,36--49 & confidence interval                  \quad\hfill 2                      \cr
background ($B$)                     \quad\hfill 37,38,48               & conjugate gradient                   \quad\hfill 30                     \cr
backwards (rate, transition)         \quad\hfill 33                     & consistent inference                 \quad\hfill 2,5                    \cr
barrier                              \quad\hfill 11,14                  & context ($I$)                        \quad\hfill 5                      \cr
Bayesian System                      \quad\hfill 2,21                   & continuity                           \quad\hfill 11                     \cr
Bayesian methods                     \quad\hfill 2,7                    & convergence                          \quad\hfill 27                     \cr
BayeSys prior                        \quad\hfill 22,40                  & cooling (see annealing)              \quad\hfill 15,18                  \cr
BayeSys program                      \quad\hfill 21,49                  & coolness ($\lambda$)                 \quad\hfill 15,17,18,19,45         \cr
BayeSys structure                    \quad\hfill 42                     & coordinates                          \quad\hfill 7,10,23,46             \cr
bell                                 \quad\hfill 23                     & copy                                 \quad\hfill 17                     \cr
Bessel function ($I_1$)              \quad\hfill 7,36                   & corners of cube                      \quad\hfill 23,24,25               \cr
binomial prior                       \quad\hfill 6,22                   & cubic symmetry                       \quad\hfill 25                     \cr
cumulant                             \quad\hfill 7,23,24,25             & Gray code                            \quad\hfill 10                     \cr
cumulant weight                      \quad\hfill 17                     & Green's function (see footprint)     \quad\hfill 36                     \cr
data ($D$)                       \quad\hfill 2,4,5,36,37,38,42,46,47,48 & GuidedWalk                           \quad\hfill 26,30,32               \cr
death of atom                       \quad\hfill 9,10,27,29              & hemisphere                           \quad\hfill 23,25                  \cr
deletion                             \quad\hfill 44                     & Hilbert curve                       \quad\hfill 2,10,11,23,26--34,39,50 \cr
delta function ($\delta$)            \quad\hfill 36,39                  & historical log                       \quad\hfill 39                     \cr
detailed balance                     \quad\hfill 9,10,12,13,26--34      & hybrid Monte Carlo                   \quad\hfill 30                     \cr
differentiability                    \quad\hfill 11                     & hypercube                   \quad\hfill 8,10,11,13,23,24,25,26,31,33,39 \cr
digital scheme                       \quad\hfill 10,13                  & hyperparameter                       \quad\hfill 6,35                   \cr
dimension-changing methods           \quad\hfill 10                     & hyperplane                           \quad\hfill 23                     \cr
dimensional data                     \quad\hfill 7,35                   & image                                \quad\hfill 35                     \cr
dimensionality                       \quad\hfill 2,23,40                & improper prior                       \quad\hfill 8                      \cr
Diophantine subtleties               \quad\hfill 33                     & incomplete gamma function ($P$)      \quad\hfill 23,25                  \cr
Dirichlet distribution               \quad\hfill 6                      & inference (as methodology)           \quad\hfill 2,4,5,9                \cr
domain size                          \quad\hfill 13                     & inference, statistical               \quad\hfill 45,49                  \cr
efficiency               \quad\hfill 3,12,13,16,17,20,24,27,41,44,47,49 & infinitely divisible                 \quad\hfill 7                      \cr
eigenvector                          \quad\hfill 9                      & information (ancillary $I$)          \quad\hfill 5                      \cr
empty object                         \quad\hfill 22,37                  & information (negative entropy $H$)   \quad\hfill 15,16,19               \cr
engines                              \quad\hfill 2,21,26--34            & insertion                            \quad\hfill 44                     \cr
England                              \quad\hfill 4,6                    & integrable distribution              \quad\hfill 38                     \cr
ensemble (${\cal N}$)                \quad\hfill 15,17,26,27,41         & internal error                       \quad\hfill 49                     \cr
ensemble (prior, likelihood, posterior of) \quad\hfill 26               & inversion                            \quad\hfill 4                      \cr
ensemble (membership)                \quad\hfill 34                     & Ireland                              \quad\hfill 4,7                    \cr
equilibrium                          \quad\hfill 15,18,27               & irreducibility                       \quad\hfill 9,26,33                \cr
error function (erf)                 \quad\hfill 38                     & Jacobian                             \quad\hfill 24,40                  \cr
error state                          \quad\hfill 40,44,49               & joint (probability distribution)     \quad\hfill 5,21,37,38             \cr
evidence ($E$)                       \quad\hfill 5,15                   & jump-diffusion                       \quad\hfill 9                      \cr
exact sampling                       \quad\hfill 16                     & Leapfrog1                            \quad\hfill 26,32                  \cr
experiment ($R$)                     \quad\hfill 4                      & Leapfrog2                            \quad\hfill 26,32                  \cr
exponential distribution             \quad\hfill 7,35                   & least squares                        \quad\hfill 4                      \cr
exponential growth                   \quad\hfill 31,32                  & L\'evy-Khinchin representation       \quad\hfill 36                     \cr
extended precision                   \quad\hfill 2,11                   & LifeStory1                           \quad\hfill 26,27                  \cr
faithful representation              \quad\hfill 8                      & LifeStory2                           \quad\hfill 26,29,38,41,44,47,48   \cr
feature of likelihood                \quad\hfill 30                     & lifetime of atom                     \quad\hfill 10                     \cr
finite resolution                    \quad\hfill 39                     & likelihood ($L$)         \quad\hfill 2,5,12--19,21,26--34,36--39,41--48 \cr
flux (quantity $z$)                \quad\hfill 3,7,21,35,36,37,38,46,47 & linear data                          \quad\hfill 35                     \cr
flux unit ($q$)                      \quad\hfill 35,37,46               & linked list                          \quad\hfill 3,50                   \cr
footprint ({\bf f})                  \quad\hfill 36,38,47               & local maxima                         \quad\hfill 18,41                  \cr
forwards (rate, transition)          \quad\hfill 33                     & local shape                          \quad\hfill 30                     \cr
fragment, of footprint               \quad\hfill 47                     & locality                             \quad\hfill 2,11                   \cr
gamma distribution                   \quad\hfill 6                      & MAPP                                 \quad\hfill 8,36                   \cr
gamma function ($\Gamma$)            \quad\hfill 37                     & mapping                              \quad\hfill 24                     \cr
Gaussian data                        \quad\hfill 37,38,48,50            & Markov chain                         \quad\hfill 8,15                   \cr
Gaussian distribution                \quad\hfill 25                     & MassInf flux ($z$)                   \quad\hfill 38                     \cr
Gaussian errors                      \quad\hfill 35,37,47               & MassInf flux unit ($q$)              \quad\hfill 37                     \cr
Gaussian prior                       \quad\hfill 35,46                  & MassInf likelihood ($L$)             \quad\hfill 36,37,47               \cr
Gaussian statistics                  \quad\hfill 8                      & MassInf prior                        \quad\hfill 35,36,46               \cr
generator, Hilbert curve             \quad\hfill 10                     & Massive Inference                    \quad\hfill 3,21,35                \cr
generator, random number             \quad\hfill 41                     & maximum a-posteriori probability     \quad\hfill 8                      \cr
geometric prior                      \quad\hfill 6,9,22,40              & maximum entropy                      \quad\hfill 4,5,6,35,46            \cr
Gibbs sampling                       \quad\hfill 21,26,37               & maximum likelihood                   \quad\hfill 7,15,45                \cr
GNU lesser general public license    \quad\hfill 50                     & MCMC (Markov chain Monte Carlo)      \quad\hfill 9,12,26                \cr
grand canonical ensemble             \quad\hfill 20                     & mean                                 \quad\hfill 8                      \cr
measure                              \quad\hfill 20,36                  & program files                        \quad\hfill 50                     \cr
median                               \quad\hfill 8                      & pseudo-inverse                       \quad\hfill 4                      \cr
Metropolis-Hastings                  \quad\hfill 12,14,27,32            & random display                       \quad\hfill 39                     \cr
mixture distribution                 \quad\hfill 6                      & random number generator              \quad\hfill 41                     \cr
mixture model                        \quad\hfill 40                     & random number procedures             \quad\hfill 50                     \cr
ML (see maximum likelihood)          \quad\hfill 8                      & random sample                        \quad\hfill 8                      \cr
mock data ($F$)                      \quad\hfill 36,37,38,43,44,47      & random scatter                       \quad\hfill 6                      \cr
mode                                 \quad\hfill 8                      & random transition ($T$)              \quad\hfill 9,10,12,13,27--34      \cr
monkey prior                         \quad\hfill 35,46                  & random variable                      \quad\hfill 17                     \cr
Monte Carlo algorithm                \quad\hfill 2                      & regularisation                       \quad\hfill 4                      \cr
movement                             \quad\hfill 27,29                  & replica exchange                     \quad\hfill 16                     \cr
multimodal problem                   \quad\hfill 17,18                  & return value                         \quad\hfill 40,44,47,49            \cr
multiplicity                         \quad\hfill 17                     & reversible-jump dynamics             \quad\hfill 9                      \cr
needle in haystack                   \quad\hfill 15                     & sampling                             \quad\hfill 8,9,15,21,38,45,47     \cr
neighbour atoms                      \quad\hfill 28,29,30,31,32,34      & sampling period ($\tau$)             \quad\hfill 10                     \cr
nested domains (${\cal D}$)          \quad\hfill 14                     & selective annealing                  \quad\hfill 3,17                   \cr
noise in data                        \quad\hfill 5                      & separability                         \quad\hfill 38                     \cr
normal cumulant (${\cal N}^{-1}$)    \quad\hfill 24                     & simplex                              \quad\hfill 23,24                  \cr
nuisance parameters ($\phi$)         \quad\hfill 5,21,45                & simulated tempering                  \quad\hfill 16                     \cr
null hypothesis                      \quad\hfill 2                      & singularities                        \quad\hfill 23                     \cr
object ($\theta$)                    \quad\hfill 2,4                    & slice sampling                    \quad\hfill 3,13,14,28,29,30,31,34,49 \cr
orthodox statistics                  \quad\hfill 2                      & smooth display                       \quad\hfill 39,45                  \cr
out-of-range location                \quad\hfill 15                     & soluble problems                     \quad\hfill 27                     \cr
over-intelligent user                \quad\hfill 25                     & space-filling curve                  \quad\hfill 3,10                   \cr
paradox                              \quad\hfill 5,11                   & sphere                               \quad\hfill 23,25                  \cr
parallel processors                  \quad\hfill 50                     & splitting and combination moves      \quad\hfill 29                     \cr
parity bit                           \quad\hfill 25                     & staircase                            \quad\hfill 31                     \cr
permutation                          \quad\hfill 23                     & standard deviation                   \quad\hfill 8,19,37,42             \cr
phase change                         \quad\hfill 19                     & statistical inference                \quad\hfill 45,49                  \cr
physics                              \quad\hfill 16                     & statistical thermodynamics           \quad\hfill 19                     \cr
point-spread-function (see footprint)\quad\hfill 36                     & stepping out                         \quad\hfill 14                     \cr
Poisson approximation                \quad\hfill 37                     & subjective assignment                \quad\hfill 5                      \cr
Poisson counts                       \quad\hfill 36,46                  & surface shape                        \quad\hfill 23                     \cr
Poisson data                         \quad\hfill 37,38,42,48,50         & symmetry                             \quad\hfill 5,13,24,25,36          \cr
Poisson errors                       \quad\hfill 35,47                  & temperature                          \quad\hfill 15                     \cr
Poisson prior                        \quad\hfill 6,9,10,22,23,35,40     & test programs                        \quad\hfill 50                     \cr
polar angle ($\psi$)                 \quad\hfill 25                     & thermodynamic integration            \quad\hfill 16                     \cr
polar coordinates                    \quad\hfill 25                     & thermodynamics                       \quad\hfill 15                     \cr
polynomial                           \quad\hfill 37,38                  & topology                             \quad\hfill 2,10,26                \cr
positive orthant                     \quad\hfill 23                     & transition                           \quad\hfill 9--14,27,33,34         \cr
positive prior                       \quad\hfill 35,46                  & trial location                       \quad\hfill 14,30,32               \cr
positive/negative prior              \quad\hfill 35,36,46               & tribe                                \quad\hfill 6,7                    \cr
posterior distribution               \quad\hfill 5                      & truncated Gaussian                   \quad\hfill 38                     \cr
precision (arithmetical)             \quad\hfill 2,6                    & tunnelling                           \quad\hfill 18                     \cr
principal eigenvector                \quad\hfill 9                      & uniform prior                        \quad\hfill 6,9,22,40              \cr
prior distribution                   \quad\hfill 5                      & unsigned integers                    \quad\hfill 10                     \cr
prior predictive (see evidence)      \quad\hfill 5                      & vertices of simplex                  \quad\hfill 24                     \cr
probability calculus                 \quad\hfill 5                      & weights ($w$)                        \quad\hfill 17                     \cr
       }
\vfill\eject
\bye


