\documentstyle[IEEEtran]{article}
\begin{document}
\tolerance 10000
\title{Robust Minimax Adaptive Approach to Regression Problems
in Interval Computations}

\author{Georgiy L. Shevlyakov}

\pagestyle{myheadings}

\markright{APIC'95, El Paso,
Extended Abstracts,
A Supplement to the international journal of {\rm Reliable
Computing}\ \ \ \ \ \ \ \ \ \ \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
\ \ \ \ \ \ \ \ \ \ \ \ \ }

\maketitle

\auffil{The author is with the Department of Mathematics,
St. Petersburg State Technical University,
St. Petersburg, 195251,  Russia.}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}


\section{Introduction}
One of the basic approaches to the synthesis of robust
estimation procedures is the minimax principle. In this case, in a given
class of densities, the least favorable one (which minimizes the
so-called Fisher
information, see definitions below) 
is determined. The unknown parameters of a regression model are
then estimated by means of the maximum likelihood method
for this density \cite{1}.

As a result, if we know the set of possible 
deviations of the actual probability distribution from the model that we
are currently using, then we can 
construct {\it robust}
statistical procedures, i.e., procedures which are stable with respect to
(possible) deviations from the a~priori distribution model.

The  robust minimax procedures provide a guaranteed level of
the  estimator's  accuracy  (measured  by the supremum of an
asymptotic variance)  for any density  distribution of a given
class.

The  form  of the solution  obtained by the minimax approach
essentially  depends upon the characteristics of a distribution
class.  In many real-life cases, the only information that we have about
the distribution consists of its tolerance interval. 
Thus, it is important  for quality control
applications to
design and implement robust methods 
that correspond to the class of all distributions with given tolerance
intervals.

In this paper, we describe minimax $M-$estimators of a
location parameter and regression parameters designed for this class.
The properties of the new estimators and of 
their  adaptive  versions  are  studied in asymptotics and in a
finite sample size case.

\section{Robust Minimax $M-$Estimators}
Consider the following linear regression model
\begin{equation}
X=\Phi\Theta+E,\end{equation}
where: \begin{itemize}
\item $X = (x_{1},..., x_{n})^T $  is a vector of observed
values; 
\item $\Theta  = (\theta_{1} ,...,\theta_{m})^T $
is a vector of unknown parameters;
\item $\Phi  = (\phi_{ij}),$ $1\le i\le n$, $1\le j\le m$,
is a given model matrix,
and  
\item $E = (e_{1} ,..., e_{n})^T $
is a vector of (unobservable)  random  errors with  an (unknown) 
density $  f $
belonging to a certain (known) class $\cal F$.
\end{itemize}

The   {\it $ M-$estimate} of a regression parameter $ \Theta$
is  defined
as \cite{1}
\begin{equation}
\hat{\Theta}_{n}=\arg\min_{\Theta}\sum_{i=1}^{n}\rho(x_{i}
-\sum_{j=1}^{m}\theta_{j}\phi_{ij}),
\end{equation}
or
\begin{equation}
\sum_{i=1}^{n}\psi(x_{i}-\sum_{j=1}^{m}\hat{\theta}_{j}\phi_{ij})\phi_{ik}
=0 , \;\;          k=1,...,m,
\end{equation}
where $\rho(u)$  is  a function (called a  {\it loss  function}), and  
$\psi(u) =\rho^{'} (u)$. It is usually required that a function $\psi$
belong to a certain class $ \Psi $.

The idea of the minimax approach is, for a given class $\cal F$, 
to choose an $M-$estimate for which the worst case least square error is
the smallest possible. 
According to \cite{1}, this choice of estimate can be done as follows:
first, we determine the
least favorable density $f^{\ast}$ for the class $\cal F$ by minimizing the
Fisher information $I(f)$  for a location parameter:
\begin{equation}
f^{\ast}  =  \arg\min_{f} I(f),\;\;
I(f) =\int_{-\infty}^{\infty}   (f^{'} / f )^{2} f dx,
\end{equation}
and then design the optimum maximum likelihood  estimate
(MLE) for this density $f^\ast$, i.e., an $M-$estimate with the
following choice of a loss function (and its derivative):
\begin{equation}
\rho^{\ast} (u) = - \log f^{\ast} (u),\;\;
\psi^{\ast}  (u) = -(f^\ast)^\prime(u)/f^\ast (u).
\end{equation}

Under some regularity and convexity  conditions on  the
classes $ \cal F $   and $\Psi$  \cite{2},
the  asymptotic  covariance matrix
$ V (\psi  ,f)$ has the saddle point $(\psi^{\ast},f^{\ast})$
 with the desired minimax property:
\[
V(\psi^{\ast} ,f )\leq  V(\psi^{\ast}   ,f^{\ast} )
\leq  V(\psi  ,f^{\ast} ).
\]
The left part of this inequality is important for practice: it means
that the use of
$\psi = \psi^{\ast}$ provides the guaranteed level of the accuracy of
robust minimax estimates.

The above-mentioned conditions on $f$ include nonnegativity,
symmetry, and normalization:
\begin{equation}
f(x)\geq   0,\;\;   f(-x) = f(x),\;\;
\int_{-\infty}^{\infty}   f(x)\; dx = 1.
\end{equation}
Symmetry is a rather restrictive condition but it provides the
minimax property of robust estimates and hence is widely used in
robustness research \cite{1}.

Depending  on  additional  restrictions  on  the  class $ \cal F $ ,
different  forms  of  the  density $ f^{\ast}$
and the appropriate loss
functions $\rho^{\ast}$   may result.

It  follows  from  \cite{1} that, for the  class
of approximately finite
densities
\begin{equation}
{\cal F} = \{  f: \int _{-l}^{l} f(x)\;dx =1-\beta  \}\;,\;0\leq \beta \leq 1,
\end{equation}
the least favorable density has exponential tails:
\begin{equation}
f^{\ast}(x;\;A,B,C,D,l) = \left\{
  \begin{array} {ll}
A\cos^{2}(Bx), & \mid x \mid \leq l , \\
C\exp (-D|x|), & \mid x \mid > l ,
  \end{array}
  \right.
\end{equation}
where  the values $A=A(\beta,l)$, $B=B(\beta,l)$, $C=C(\beta,l)$, and 
$D=D(\beta,l)$  are
chosen to  satisfy the following conditions:
\begin{itemize}
\item conditions (6);
\item the condition that $f^\ast\in {\cal F}$; and
\item  the condition that the 
resulting (piecewise defined) function $f^\ast$ is smooth 
at the point $ x = l$.
\end{itemize}
These conditions can be described by the following equations:
$$\int _{-\infty}^{\infty} f^{\ast}(x)\;dx=1\;,\;
\int _{-l}^{l} f^{\ast}(x)\;dx=1-\beta,$$
\begin{equation}
f^{\ast}(l-0)=f^{\ast}(l+0)\;,\;(f^{\ast}(l-0))'=(f^{\ast}(l+0))'\;.
\end{equation}

The exponential tails of the least favorable density (8)
and the corresponding
form of the robust loss function $\rho^{\ast}=|x|$  for $|x|>l$ imply
that the observed data with $|x|\ge l$ are simply ignored (``rejected'')
when we apply this method. This rejection is similar to 
the least modules
method \cite{1}. The smaller $l$, the more data is rejected.

The following result can be easily proven.
\begin{theorem}
The least favorable density (4) for the class with inequality constraint
on a tolerance interval
$${\cal F}_{1}=\{f:\; F^{-1}(1-\beta/2)-F^{-1}(\beta/2)\leq b\},$$
\begin{equation}
F(x)=\int_{-\infty}^{x}f(t)\;dt
\end{equation}
is of the form (8) with $l=b/2$.
\end{theorem}

\section{The Least Favorable Density for the Class with Constraints
on Tolerance Intervals}
Consider the class ${\cal F}_{2}$
with
the inequality constraints on tolerance intervals:

$${\cal F}_{2}=\{f:\;F^{-1}(1-\beta_{1}/2)-F^{-1}(\beta_{1}/2) \leq b_{1},$$
\begin{equation}
\;F^{-1}(1-\beta_{2}/2)-F^{-1}(\beta_{2}/2) \leq b_{2} \}\;
\end{equation}
with $0\leq \beta_{2}\leq \beta_{1}\leq 1,\; b_{1}\leq b_{2}$.
The following result is valid in this case.

\begin{theorem}
The least favorable density (4) for the class ${\cal F}_{2}$ is of the form:
\begin{equation}
f^{\ast}(x) = \left \{
              \begin{array} {l}
f^{\ast}(x;\;A_{1},B_{1},C_{1},D_{1},b_1/2) \ {\rm if}\ 
b_{2}/b_1\le k_{1} , \\
f^{\ast}(x;\;A,B,C,D,b)\ {\rm if} \ 
k_{1} < b_{2}/b_{1}\leq k_{2},\\
f^{\ast}(x;\;A_{2},B_{2},C_{2},D_{2},b_2/2) \ {\rm if}\
{b_{2}/b_{1}} > k_{2} ,
               \end{array}
                    \right.
\end{equation}
where:
\begin{itemize}
\item the function $f^{\ast}(x;\;A,B,C,D,b)$ is defined by the
equations (8); 
\item the values of the parameters $A_{1},B_{1},C_{1},D_{1}$ are
determined as $A_1=A(\beta_1,b_1/2)$, $B_1=B(\beta_1,b_1/2)$,
$C_1=C(\beta_1,b_1/2)$, $D_1=D(\beta_1,b_1/2)$;
\item the values of the parameters $A_{2},B_{2},C_{2},D_{2}$ are
determined as $A_2=A(\beta_2,b_2/2)$, $B_2=B(\beta_2,b_2/2)$,
$C_2=C(\beta_2,b_2/2)$, $D_2=D(\beta_2,b_2/2)$;
\item the values of the
parameters $A,B,C,D$ and $b\ (b_{1}/2< b<b_{2}/2)$  are determined
from the equations
$$\int_{-\infty}^{\infty} f^{\ast}(x;\;A,B,C,D,b)\;dx=1,$$
$$\int_{-b_{1}/2}^{b_{1}/2} f^{\ast}(x;\;A,B,C,D,b)\;dx=1-\beta_{1},$$
$$\int_{-b_{2}/2}^{b_{2}/2} f^{\ast}(x;\;A,B,C,D,b)\;dx=1-\beta_{2},$$
$$f^{\ast}(b-0;\;A,B,C,D,b)=f^{\ast}(b+0;\;A,B,C,D,b),$$
$$(f^{\ast})^\prime (b-0;\;A,B,C,D,b)=$$ 
$$(f^{\ast})^\prime (b+0;\;A,B,C,D,b);$$
\item and, finally, the values of the switching parameters 
$k_{1}$ and $k_{2}$ of the solution (12) 
are derived from the equations
\[
\int _{0}^{k_{1}b_{2}/2}
f^{\ast}(x;\;A_{1},B_{1},C_{1},D_{1},{b_1\over 2})
\;dx={1-\beta_{2}\over 2};
\]
\[
\int _{0}^{k_{2}b_{1}/2}
f^{\ast}(x;\;A_{2},B_{2},C_{2},D_{2},{b_2\over 2})
\;dx={1-\beta_{1}\over 2}.
\]
\end{itemize}
\end{theorem}

Various properties of the solution (12) are connected
with the degree to which the constraints are taken into account:
\begin{itemize}
\item 
in the  first  zone $(b_{2}/b_{1}\leq k_{1})$,
only the first restriction 
matters;
\item  in  the  third  zone $(b_{2}/b_{1}>k_{2})$
only  the second restriction
is   essential;
\item   in  the  intermediate  zone,   both
restrictions are used. 
\end{itemize}
From (5) and (12), we can conclude that:
\begin{itemize}
\item for relatively small distribution dispersion
(in the first zone), the ``mild'' robust algorithm based on
$f^{\ast}(x;\;A_{1},B_{1},C_{1},D_{1},b_1/2)$ is optimal;
\item for (relatively) large distribution dispersion (in the third zone),
the hard robust algorithm (with the hard rejection of sample elements)
 based on
$f^{\ast}(x;\;A_{2},B_{2},C_{2},D_{2},b_2/2)$ is optimal;
\item in the middle  zone, a compromise  between these
algorithms is the best solution.
\end{itemize}

\section{Robust Adaptive $M -$Estimators}
In considering the problems
of designing robust estimators, we have assumed the
availability of a~priori information on the characteristics of a
distribution class. However, in practice,
these characteristics are often unknown a~priori, and can only be determined
during the  data processing itself. It is therefore necessary to
develop  estimators  which can adapt to the ever
increasing volume of data, and correspondingly correct the characteristics of
the distribution  class.

In the adaptive procedure, the characteristics $b_{1}$ and $b_{2}$
of the class ${\cal F}_{2}$ (defined by equation (11)) 
are estimated and then used in
the robust minimax algorithm processing the same sample $x_{1},...,x_{n}$.
For moderate sample sizes, this approach is heuristic,
but it is the simplest one to examine
by the Monte Carlo technique.

Consider  the  adaptive  algorithm  of  robust estimation of
regression parameters (2), called {\it ARLI - regression}:
\begin{itemize}
\begin{itemize}
\item[(i)]   Choose initial hard robust estimates for $\Theta$   :
       $\hat {\Theta}_{HR} $ from (2) using the least modules method
       with the loss function
\[
       {\rho}(x)=| x | .
\]
\item[(ii)]  Evaluate the error estimates
\[
  \hat {e}_{i} = x_{i} - \sum _{j=1}^{m}
   \hat {\theta}_{j\,HR} \, \phi_{ij} \, ,\;\; i = 1,...,n.
\]
\item[(iii)] Evaluate the estimates of  the  characteristics of the
              class $ {\cal F}_{2}$:
\[
\hat b_{i}=\hat e_{(k_i)}-\hat e_{(l_i)},\quad i=1,2,
\]
\[
l_i=[\beta_{i}n/2],\quad k_i=n-l_i.
\]
\item [(iv)]  Use  the  robust  minimax  algorithm  (2) with the loss
              function $ \rho^{\ast}= -\log f^{\ast}$ according to
              the formulas (5) and (12).
\end{itemize}
\end{itemize}
The behavior of the ARLI-regression algorithm with respect
to finite size samples ($n$ from 20 to 100 with step 10 and from 100 to
1000 with step 100) with $\beta_{1}=0.9,\;\beta _{2}=0.1$
was studied by the Monte Carlo technique.  The results of simulation
show high efficiency of the ARLI-algorithm under the mixture
(contamination) models of data distributions \cite{1}.

\begin{thebibliography}{99}

\bibitem{1} 
P. J. Huber, {\bf Robust Statistics}, Wiley, New York, 1981.

\bibitem{2} 
      P. J. Huber, ``The behavior of maximum likelihood
         estimates under nonstandard conditions'', {\it Proc. of
         the 5-th Berkeley Symp. on Math. Stat. and Prob.}, 1967, 
         Univ. of California Press, Berkeley, Vol. 1, pp. 221--233.
\end{thebibliography}

\end{document}

