Create test table
create table outlier_demo (a number, val number)
Table created.
Populate table
insert into outlier_demo
select rownum -- we don't care
, round(dbms_random.value(20,50),1) -- some random numbers between 20 and 50
from dual
-- let's create 49 rows
connect by level < 50
Statement processed.
Add outlier
insert into outlier_demo values (0,500)
1 row(s) inserted.
Check sample data
select * from outlier_demo
A | VAL | 1 | 27.9 | 2 | 42.5 | 3 | 49.7 | 4 | 22.3 | 5 | 42.3 | 6 | 20.2 | 7 | 33.9 | 8 | 46.8 | 9 | 34 | 10 | 26.6 | 11 | 29.9 | 12 | 37 | 13 | 28.1 | 14 | 34.6 | 15 | 35.7 | 16 | 45.2 | 17 | 24.1 | 18 | 35.4 | 19 | 28.4 | 20 | 42.2 | 21 | 33 | 22 | 30 | 23 | 47 | 24 | 32.8 | 25 | 24 | 26 | 48.6 | 27 | 47.4 | 28 | 41.9 | 29 | 47.4 | 30 | 46.2 | 31 | 25.9 | 32 | 40.2 | 33 | 38.7 | 34 | 42 | 35 | 35.1 | 36 | 35.7 | 37 | 46.4 | 38 | 47.2 | 39 | 28.5 | 40 | 49.3 | 41 | 29.2 | 42 | 45.4 | 43 | 35.3 | 44 | 27.3 | 45 | 33.4 | 46 | 49.3 | 47 | 47.2 | 48 | 45.7 | 49 | 21.8 | 0 | 500 |
---|
Calculate averages
select count(*) c
-- what's our average?
,round(avg(val)) the_avg
-- what would the average be without the known outlier?
,round(avg(case when val < 500 then val end)) no_outlier
,round(stddev(val)) std
-- what's the median
,round(median(val)) the_median
from outlier_demo
C | THE_AVG | NO_OUTLIER | STD | THE_MEDIAN | 50 | 46 | 37 | 66 | 36 |
---|
Go Analytical!
select a, val
-- what is 2 standard deviations away
-- provide result for each row
,round(2*stddev(val) over (order by null)) local_std
from outlier_demo
A | VAL | LOCAL_STD | 1 | 27.9 | 132 | 2 | 42.5 | 132 | 3 | 49.7 | 132 | 4 | 22.3 | 132 | 5 | 42.3 | 132 | 6 | 20.2 | 132 | 7 | 33.9 | 132 | 8 | 46.8 | 132 | 9 | 34 | 132 | 10 | 26.6 | 132 | 11 | 29.9 | 132 | 12 | 37 | 132 | 13 | 28.1 | 132 | 14 | 34.6 | 132 | 15 | 35.7 | 132 | 16 | 45.2 | 132 | 17 | 24.1 | 132 | 18 | 35.4 | 132 | 19 | 28.4 | 132 | 20 | 42.2 | 132 | 21 | 33 | 132 | 22 | 30 | 132 | 23 | 47 | 132 | 24 | 32.8 | 132 | 25 | 24 | 132 | 26 | 48.6 | 132 | 27 | 47.4 | 132 | 28 | 41.9 | 132 | 29 | 47.4 | 132 | 30 | 46.2 | 132 | 31 | 25.9 | 132 | 32 | 40.2 | 132 | 33 | 38.7 | 132 | 34 | 42 | 132 | 35 | 35.1 | 132 | 36 | 35.7 | 132 | 37 | 46.4 | 132 | 38 | 47.2 | 132 | 39 | 28.5 | 132 | 40 | 49.3 | 132 | 41 | 29.2 | 132 | 42 | 45.4 | 132 | 43 | 35.3 | 132 | 44 | 27.3 | 132 | 45 | 33.4 | 132 | 46 | 49.3 | 132 | 47 | 47.2 | 132 | 48 | 45.7 | 132 | 49 | 21.8 | 132 | 0 | 500 | 132 |
---|
The Solution
select count(*) c
,round(avg(val)) the_avg
,round(avg(case when val < 500 then val end)) no_outlier
,round(stddev(val)) new_stddev
,round(avg(local_std)) filter
from
(select a, val
-- what is 2 standard deviations away
-- provide result for each row
,round(2*stddev(val) over (order by null)) local_std
from outlier_demo
)
-- only average those values within two standard deviations
where val < local_std
C | THE_AVG | NO_OUTLIER | NEW_STDDEV | FILTER | 49 | 37 | 37 | 9 | 132 |
---|