RobustStats.h
1/***************************************************************************
2 *
3 * Copyright (C) 2011 by Stefan Oslowski
4 * Licensed under the Academic Free License version 2.1
5 *
6 ***************************************************************************/
7
8// This set of methods aims at providing robust, resistant and efficient statistics
9// Resistance means that the methods provided are insensitive to small changes of sample; i.e. are dependent on the locust of the data and ignore outliers
10// Robustness means that the methods provided are insensitive to the assumptions about the intrinsic population
11// Efficient means that the methods provided yield maximum information that can be derived from provided data.
12//
13// For more information, see Understanding Robust and Exploratory Data Analysis by Hoaglin et al, or Robust estimators of scale, Lax 1985
14
15#ifndef __RobustStats_h
16#define __RobustStats_h
17
18#include <cmath>
19#include <vector>
20#include <algorithm>
21#include <iostream>
22
23using namespace std;
24
25// Measures of scale:
26
27// Median absolute deviation
28template<typename T>
29T MAD ( T * data, unsigned size );
30
31
32// f-spread
33template<typename T>
34T f_spread ( T * data, unsigned size );
35
36// f-pseudosigma
37template<typename T>
38T f_pseudosigma ( T * data, unsigned size );
39
40template<typename T>
41T f_pseudosigma ( T f_spread );
42
43// robust std deviation
44
45template<typename T>
46T robust_stddev ( T * data, unsigned size, T initial_guess, T initial_guess_scale = 1.4826, T Tukey_tune = 6.0 );
47
48template<typename T>
49T robust_stddev ( T * data, unsigned size);
50//double robust_stddev ( double * data, unsigned size );
51//
52
53
54// Measures of scale:
55
61template<typename T>
62T MAD ( T * data, unsigned size )
63{
64 vector<T> input;
65 input.resize ( size );
66
67 //copy ( data, data + size, back_inserter ( input ) );
68 std::copy ( data, data + size, input.begin() );
69
70 sort ( input.begin(), input.end() );
71
72 // median of the input data
73 T median = input.at ( size%2==0 ? (unsigned)( (float)size / 2 + 0.5 ) : size / 2 );
74
75 // vector of the absolute deviations from the median
76 vector<T> AD;
77
78 for (unsigned element = 0 ; element < size ; element ++ )
79 AD.push_back ( abs ( input.at( element ) - median ) );
80
81 sort ( AD.begin(), AD.end() );
82
83 // return the median absolute deviation
84 return AD.at ( size%2==0 ? (unsigned)( (float)size / 2 + 0.5 ) : size / 2 );
85}
86
91
92template<typename T>
93T f_spread ( T * data, unsigned size )
94{
95 cerr << "RobustStats::f_spread WARNING f_spread not implemented yet" << endl;
96 return (T)0.0;
97}
98
103template<typename T>
104T f_pseudosigma ( T * data, unsigned size )
105{
106 // 1.349 is the value of f_spread for Gaussian distribution
107 return f_spread ( data, size ) / 1.349;
108}
109
110template<typename T>
111T f_pseudosigma ( T f_spread )
112{
113 // 1.349 is the value of f_spread for Gaussian distribution
114 return f_spread / (T)1.349;
115}
116
117
118//TODO some sources say that the most efficient choice is 9 for the tuning constant. I think this is because some sources use c * MAD, while others use c * MAD * 1.4826
119//TODO IDL implementation has a slighly different estimator. I think they are attempting to get the correct assymptotic value but they are doing it wrong :) Hmm, gives the same result, I guess it's the same estimator just expressed differently.
128template<typename T>
129T robust_stddev ( T * data, unsigned size, T initial_guess, T initial_guess_scale, T Tukey_tune )
130{
131 vector<T> input;
132 input.resize ( size );
133 copy ( data, data + size, input.begin() );
134
135 sort ( input.begin(), input.end() );
136
137 T median = input.at ( size%2==0 ? (unsigned)( (float)size / 2 + 0.5 ) : size / 2 );
138
139 vector<T> biweighted_sq;
140 vector<unsigned> indices;
141 T tmp;
142 // calculate the biweighted values:
143 for ( unsigned i_el = 0; i_el < size ; ++ i_el )
144 {
145 tmp = ( input.at ( i_el ) - median ) / ( Tukey_tune * initial_guess_scale * initial_guess ) ;
146 tmp *= tmp ;
147
148 //cout << i_el << " " << Tukey_tune << " " << initial_guess << " " << initial_guess_scale << endl;
149 if ( tmp < 1.0 )
150 {
151 indices.push_back ( i_el );
152 biweighted_sq.push_back ( tmp );
153 }
154 }
155
156 tmp = 0.0;
157 T std_dev = (T)0.0;
158 for ( unsigned i_el = 0; i_el < biweighted_sq.size() ; ++ i_el )
159 {
160 std_dev += pow ( input.at ( indices.at ( i_el ) ) - median, 2 ) * pow ( 1.0 - biweighted_sq.at( i_el ), 4 ) ;
161 tmp += ( 1.0 - biweighted_sq.at ( i_el ) ) * ( 1.0 - 5.0 * biweighted_sq.at ( i_el ) );
162 }
163
164 tmp = abs ( tmp );
165
166 // eq 4.15 Lax 1985 (robust scale estimators)
167 //cout << " LAX version " << endl;
168 return std_dev = sqrt ( std_dev / ( (T)size - 1.0 ) ) / tmp * (T)size ;
169 // IDL implementation
170 //cout << " IDL version " << endl;
171 //return std_dev = sqrt ( std_dev / tmp / (tmp - 1) * (T)size) ;
172}
173
174/*
175 * @brief convienience wrapper for robust_stddev which assumes that MAD is to be used as the initial guess for scale
176 */
177template<typename T>
178T robust_stddev ( T * data, unsigned size )
179{
180 return robust_stddev ( data, size, MAD ( data, size ) );
181}
182
183#endif
STL namespace.

Generated using doxygen 1.14.0