\documentstyle[IEEEtran]{article}
\begin{document}
\tolerance 10000
\title{On the Choice of an Optimization Criterion under Uncertainty
in Interval Computations - Nonstochastic Approach}

\author{G. L. Shevlyakov and N. O. Vil'chevskiy}

\pagestyle{myheadings}

\markright{APIC'95, El Paso,
Extended Abstracts,
A Supplement to the international journal of {\rm Reliable
Computing}\ \ \ \ \ \ \ \ \ \ \  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
\ \ \ \ \ \ \ \ \ \ \ \ \ }

\maketitle

\auffil{The authors are with the Department of Mathematics,
St. Petersburg State Technical University,
St. Petersburg, 195251, Russia.}

\newtheorem{theorem}{Theorem}

To compute tolerance intervals for location and regression parameters, it is
necessary to first get approximate estimates of these
characteristics. One of the main statistical approaches to obtaining
these estimates consists of using an optimization criterion, e.g.,
least squares, least absolute values, Chebyshev, etc.~\cite{1}. From the
viewpoint of estimating tolerance intervals, this optimization
approach has the following additional advantage: not only can we get
the estimate itself by finding the values of the parameters $\Theta$ at which
the criterion $J(\Theta)$
attains its minimum, but we can also define the tolerance
interval as the set of all the values of
$\Theta$ for which the value $J(\Theta)$ is sufficiently small (i.e.,
is $\le J_0$ for an appropriately chosen $J_0$).

In traditional mathematical statistics, 
the choice of an optimization criterion is uniquely
determined by the choice of an appropriate
stochastic model, as in the cases of maximum likelihood,
 robust or nonparametric
approaches \cite{2}. The situation is different, however,
in the nonstochastic approach to data analysis,
when there are no (objective) reasons to use any stochastic model.
This situation occurs, e.g.,
when we are processing unique data samples: this is a typical 
situation with medical data.

In this paper, we show that it is possible 
to reduce the a~priori uncertainty connected with the choice of an
optimization criterion, if we impose natural requirements on
M-estimates of location
\cite{3,4}. In some cases, these requirements lead to the unique
choice of the optimization method.

In our considerations, we do not assume any stochastic model
\cite{4,5}.

\section{Univariate Case} 
An {\it M-estimate} of the location is defined as 
\begin{equation}
\hat{\theta}_{n} = \arg \min _{\theta} \sum_{i=1}^{n}
\rho( x_{i} - \theta ),
\end{equation}
where:
\begin{itemize}
\item $x_1,...,x_n$ is a given sample, and
\item $\rho(u)$ is a function called a {\it loss function.}
\end{itemize}

The following result is known for M-estimates:
\smallskip

\noindent{\bf Theorem}\ \cite{3}\ {\it If an M-estimate satisfies the
following three requirements:
\begin{itemize}
\begin{itemize}
\item[(i)] convexity of $\rho(u)$;
\item[(ii)]
central symmetry $$ \hat{\theta}_{n} ( -x_{1}, ..., -x_{n} ) =
 -\hat{\theta}_{n} ( x_{1}, ..., x_{n} );$$
\item[(iii)]equivariance to scale transformations $ \hat{\theta}_{n}
  ( \lambda x_{1}, ..., \lambda x_{n} ) = \lambda \hat{\theta}_{n}
   ( x_{1}, ..., x_{n})$, 
\end{itemize}
\end{itemize}
then this M-estimate coincides with the M-estimate defined by a loss
function 
$\rho(u) = |u|^{p}$ for some
$p \ge 1$.}
\smallskip

\noindent{\it Comment.} In other words,
we get $L_{p}$-estimates.

\begin{theorem}
The conditions (i), (ii),  and equivariance to
monotone transformations:
\[
 \hat{\theta}_{n}( f( x_{1} ), ..., f( x_{n} ) )
 = f( \hat{\theta}_{n} ( x_{1}, ..., x_{n} ) ) ,
\]
  where $f( x )$
 is an arbitrary strictly monotone function, yield the $L_{1}$-estimate with
 $\rho( u ) = |u|.$
\end{theorem}
\begin{theorem}
 The requirements of differentiability and of minimum norm of the sensitivity
 of estimates to small data deviations
\[
\rho^{\ast} = \arg \min _{\rho}
\sum _{i=1}^{n} ( \frac {\partial \hat{\theta}_{n}} {\partial x_{i}})^{2}
\]
yield the $L_{2}$-estimate with $\rho^{\ast}( u ) = u^{2}$.
\end{theorem}
\begin{theorem}
 The requirement that the estimate admit the recursive form
\[
\hat{\theta}_{n+1} = \hat{\theta}_{n} + \gamma_{n}
 \psi( x_{n+1} - \hat{\theta}_{n} )
\]
yields the $L_{2}$-estimate with $\rho( u ) = u^{2}, \; \psi( u ) = u,\;
\gamma_{n} = n^{-1} $.
\end{theorem}

\section{Multivariate Case} 
For multivariate
data $$\; X_{1}, ..., X_{n},\ X_{i} = ( x_{i1}, ..., x_{im} ),$$
an {\it M-estimate} of the mean values is defined as
\[
\hat{\Theta}_{n} = \arg \min _{\Theta}
\sum _{i=1}^{n} \rho( x_{i1} - \theta_{1}, ..., x_{im} - \theta_{m} ),
\]
where $\hat{\Theta}_{n} = \hat{\Theta}_{n}( X_{1}, ..., X_{n} )
= ( \hat{\theta}_{1}, ..., \hat{\theta}_{m} )$  is a vector of
estimates, and 
$\rho( u_{1}, ..., u_{m} )$ is a function of $m$ variables called a 
{\it loss function}.

In the multivariate case, the requirements of equivariance to
scale transformations can be formulated in several different ways:

\begin{theorem}
If an M-estimate satisfies the following requirements:
\begin{itemize}
\item equivariance to 
independent scale transformations of vector components:
$$\hat{\Theta}_{n}( \tilde{X}_{1}, ..., \tilde{X}_{n} ) =
( \lambda_{1} \hat{\theta}_{1}, ..., \lambda_{m} \hat{\theta}_{m} ),$$
$$\tilde{X}_{i} = ( \lambda_{1} x_{i1}, ..., \lambda_{m} x_{im} );$$
\item convexity of $\rho$; 
\item central symmetry of vector components $$\hat{\theta}_{j}( -x_{1j},
..., -x_{nj} ) = -\hat{\theta}_{j}( x_{1j}, ..., x_{nj} ),$$
$$j = 1, ...,  m;$$
\item piece-wise differentiability of $\psi_{j} = \partial \rho /
 \partial u_{j},\;j = 1, ..., m$;
\item boundary conditions
$$\rho( u_{1}, ..., u_{l-1}, 0, u_{l+1}, ..., u_{m} ) =
 \sum _{j \neq l}^{m} A_{j} |u_{j}|^{p_{j}},$$
 \begin{equation}A_{j} > 0, \;
  p_{j} \geq 1,\; l = 1, ..., m
  \end{equation}
\end{itemize}
then 
  \[
  \rho( u_{1}, ..., u_{m} ) =
  \sum _{j=1}^{m} A_{j}|u_{j}|^{p_{j}},\; A_{j} > 0,\; p_{j} \geq 1.
  \]
\end{theorem}
\begin{theorem}
The equivariance of an M-estimate to the
scale transformation
\begin{equation}
\hat{\Theta}_{n}( \lambda X_{1}, ..., \lambda X_{n} ) =
\lambda \hat{\Theta}_{n}( X_{1}, ..., X_{n} )
\end{equation}
implies that the loss function is homogeneous:
\[
\rho( \lambda u_{1}, ..., \lambda u_{m} ) =
|\lambda|^{p} \rho( u_{1}, ..., u_{m} ).
\]
\end{theorem}
\begin{theorem}
 The equivariance of M-estimates to arbitrary
orthogonal transformations of data
$$\tilde{X}_{i} = O_{m} X_{i},\;i = 1, ..., n;$$
\begin{equation}
\hat{\Theta}_{n}( \tilde{X}_{1}, ..., \tilde{X}_{n} )
 = O_{m} \hat{\Theta}_{n}( X_{1}, ..., X_{n} ),
 \end{equation}
 where $O_{m} $ is an orthogonal matrix of dimension  $m$,
yields the loss functions of the form
\[
\rho( u_{1}, ..., u_{m} ) = F ( \sum_{j=1}^{m} u_{j}^{2} ),
\]
where $F = F(v^{2})$ is an arbitrary convex function.
\end{theorem}
\begin{theorem}
If an M-estimate satisfies condition (4) and one of the
conditions (2) or (3), then it corresponds to
a multivariate $L_{p}$-optimization criterion
$$\rho(u_{1}, ..., u_{m} ) = A r^{p},\; A > 0,\;p\geq1,$$
\begin{equation}
r = ( \sum_{j=1}^{m}u_{j}^{2} )^{1/2}.
\end{equation}
\end{theorem}
\bigskip

Another characterization of criterion (5)
was obtained in \cite{6}:

\begin{theorem}
 The equivariance of M-estimates to arbitrary affine
transformations  yields
\[
\rho( u_{1}, ..., u_{m} ) = \sum_{j=1}^{m} u_{j}^{2}.
\]
\end{theorem}

\begin{thebibliography}{99}

\bibitem{1} J. R. Rice, {\bf The Approximation of Functions. Linear
Theory. Vol. 1,} Addison Wesley, New York, 1964.

\bibitem{2} J. W. Tukey, ``The Future of Data Analysis'',
{\it Ann. Math. Statist.}, 1962, Vol. 33, No. 1, pp. 1--67.

\bibitem{3} P. J. Bickel and E. L. Lehmann, ``Descriptive Statistics
for Nonparametric Models. II. Location'', {\it  Ann. Statist.},
1975, Vol. 3, pp. 1045--1069.

\bibitem{4} V. Ya. Kreinovich, ``A General Approach
to Analysis of Uncertainty in Measurements'', {\it
Proceedings of the
3rd USSR National Symposium on Theoretical Metrology}, Leningrad,
Mendeleev Metrology Institute (VNIIM), 1986, 
pp. 187--188 (in Russian).

\bibitem{5} G. L. Shevlyakov and  N. O. Vil'chevskiy,
``Axiomatic Approach
to the Choice
of an Optimization Criterion'', {\it Proceedings of
the 6-th Seminar on Nonparametric
and Robust Methods in Cybernetics}, Tomsk University Press, Tomsk, 1987,
Part 1, pp. 93--97 (in Russian).

\bibitem{6} W. Gehrig and K. Hellwig, ``Eine Charakterisierung
der gewichteten $L_{r}$-Distanz'', In: {\bf OR-Spektrum}, Springer
Verlag,
Berlin--Heidelberg--New York, 1982, pp. 233--237.
\end{thebibliography}


\end{document}