Create test table
create table outlier_demo (a number, val number)
Table created.
Populate table
insert into outlier_demo
select rownum -- we don't care
, round(dbms_random.value(20,50),1) -- some random numbers between 20 and 50
from dual
-- let's create 49 rows
connect by level < 50
Statement processed.
Add outlier
insert into outlier_demo values (0,500)
1 row(s) inserted.
Check sample data
select * from outlier_demo
A | VAL |
---|---|
1 | 27.9 |
2 | 42.5 |
3 | 49.7 |
4 | 22.3 |
5 | 42.3 |
6 | 20.2 |
7 | 33.9 |
8 | 46.8 |
9 | 34 |
10 | 26.6 |
11 | 29.9 |
12 | 37 |
13 | 28.1 |
14 | 34.6 |
15 | 35.7 |
16 | 45.2 |
17 | 24.1 |
18 | 35.4 |
19 | 28.4 |
20 | 42.2 |
21 | 33 |
22 | 30 |
23 | 47 |
24 | 32.8 |
25 | 24 |
26 | 48.6 |
27 | 47.4 |
28 | 41.9 |
29 | 47.4 |
30 | 46.2 |
31 | 25.9 |
32 | 40.2 |
33 | 38.7 |
34 | 42 |
35 | 35.1 |
36 | 35.7 |
37 | 46.4 |
38 | 47.2 |
39 | 28.5 |
40 | 49.3 |
41 | 29.2 |
42 | 45.4 |
43 | 35.3 |
44 | 27.3 |
45 | 33.4 |
46 | 49.3 |
47 | 47.2 |
48 | 45.7 |
49 | 21.8 |
0 | 500 |
Calculate averages
select count(*) c
-- what's our average?
,round(avg(val)) the_avg
-- what would the average be without the known outlier?
,round(avg(case when val < 500 then val end)) no_outlier
,round(stddev(val)) std
-- what's the median
,round(median(val)) the_median
from outlier_demo
C | THE_AVG | NO_OUTLIER | STD | THE_MEDIAN |
---|---|---|---|---|
50 | 46 | 37 | 66 | 36 |
Go Analytical!
select a, val
-- what is 2 standard deviations away
-- provide result for each row
,round(2*stddev(val) over (order by null)) local_std
from outlier_demo
A | VAL | LOCAL_STD |
---|---|---|
1 | 27.9 | 132 |
2 | 42.5 | 132 |
3 | 49.7 | 132 |
4 | 22.3 | 132 |
5 | 42.3 | 132 |
6 | 20.2 | 132 |
7 | 33.9 | 132 |
8 | 46.8 | 132 |
9 | 34 | 132 |
10 | 26.6 | 132 |
11 | 29.9 | 132 |
12 | 37 | 132 |
13 | 28.1 | 132 |
14 | 34.6 | 132 |
15 | 35.7 | 132 |
16 | 45.2 | 132 |
17 | 24.1 | 132 |
18 | 35.4 | 132 |
19 | 28.4 | 132 |
20 | 42.2 | 132 |
21 | 33 | 132 |
22 | 30 | 132 |
23 | 47 | 132 |
24 | 32.8 | 132 |
25 | 24 | 132 |
26 | 48.6 | 132 |
27 | 47.4 | 132 |
28 | 41.9 | 132 |
29 | 47.4 | 132 |
30 | 46.2 | 132 |
31 | 25.9 | 132 |
32 | 40.2 | 132 |
33 | 38.7 | 132 |
34 | 42 | 132 |
35 | 35.1 | 132 |
36 | 35.7 | 132 |
37 | 46.4 | 132 |
38 | 47.2 | 132 |
39 | 28.5 | 132 |
40 | 49.3 | 132 |
41 | 29.2 | 132 |
42 | 45.4 | 132 |
43 | 35.3 | 132 |
44 | 27.3 | 132 |
45 | 33.4 | 132 |
46 | 49.3 | 132 |
47 | 47.2 | 132 |
48 | 45.7 | 132 |
49 | 21.8 | 132 |
0 | 500 | 132 |
The Solution
select count(*) c
,round(avg(val)) the_avg
,round(avg(case when val < 500 then val end)) no_outlier
,round(stddev(val)) new_stddev
,round(avg(local_std)) filter
from
(select a, val
-- what is 2 standard deviations away
-- provide result for each row
,round(2*stddev(val) over (order by null)) local_std
from outlier_demo
)
-- only average those values within two standard deviations
where val < local_std
C | THE_AVG | NO_OUTLIER | NEW_STDDEV | FILTER |
---|---|---|---|---|
49 | 37 | 37 | 9 | 132 |