# Real World Problem Solving with SQL

• Tutorial Real World Problem Solving with SQL
• Description Examples of how to use SQL to solve real problems, as discussed in the database@home event. https://asktom.oracle.com/pls/apex/asktom.search?oh=8141
• Tags match_recognize, sum, analytic functions
• Area SQL Analytics
• Contributor Chris Saxon (Oracle)
• Created Tuesday May 05, 2020
• Modules 14
• ## Prerequisite SQL

``````create table fw_store (    -- shops selling fireworks
shopid      varchar2(10) primary key,
containers  integer    -- containers of fireworks in starting stock (1 container = 250 units; see Module 3)
)
/

-- Seed data: the three shops used throughout the tutorial.
begin
insert into fw_store values ('AALBORG'  , 4);
insert into fw_store values ('GLOSTRUP' , 4);
insert into fw_store values ('HADERSLEV', 3);
commit;
end;
/

-- Predicted (budgeted) sales quantity per shop per day.
create table fw_daybudget (
shopid      varchar2(10) references fw_store (shopid),
budgetdate  date,
budgetnem   number    -- expected quantity sold on this date
)
/

-- Daily budgets for the five days up to New Year's Eve.
begin
insert into fw_daybudget values ('AALBORG'  , date '2011-12-27', 150);
insert into fw_daybudget values ('AALBORG'  , date '2011-12-28', 200);
insert into fw_daybudget values ('AALBORG'  , date '2011-12-29', 300);
insert into fw_daybudget values ('AALBORG'  , date '2011-12-30', 500);
insert into fw_daybudget values ('AALBORG'  , date '2011-12-31', 400);
insert into fw_daybudget values ('GLOSTRUP' , date '2011-12-27', 150);
insert into fw_daybudget values ('GLOSTRUP' , date '2011-12-28', 200);
insert into fw_daybudget values ('GLOSTRUP' , date '2011-12-29', 300);
insert into fw_daybudget values ('GLOSTRUP' , date '2011-12-30', 500);
insert into fw_daybudget values ('GLOSTRUP' , date '2011-12-31', 400);
insert into fw_daybudget values ('HADERSLEV', date '2011-12-27', 100);
insert into fw_daybudget values ('HADERSLEV', date '2011-12-28', 150);
insert into fw_daybudget values ('HADERSLEV', date '2011-12-29', 200);
insert into fw_daybudget values ('HADERSLEV', date '2011-12-30', 400);
insert into fw_daybudget values ('HADERSLEV', date '2011-12-31', 300);
commit;
end;
/

-- Share of a day's budget expected in each opening hour (09:00-18:00).
create table fw_hourbudget (
hour        integer,
percent     number    -- percentage of the daily budget; the ten rows sum to 100
)
/

begin
insert into fw_hourbudget values ( 9,  4);
insert into fw_hourbudget values (10,  8);
insert into fw_hourbudget values (11, 10);
insert into fw_hourbudget values (12, 12);
insert into fw_hourbudget values (13, 12);
insert into fw_hourbudget values (14, 12);
insert into fw_hourbudget values (15, 14);
insert into fw_hourbudget values (16, 14);
insert into fw_hourbudget values (17, 10);
insert into fw_hourbudget values (18,  4);
commit;
end;
/

-- Actual sales recorded per shop per hour.
create table fw_sales (
shopid      varchar2(10) references fw_store (shopid),
saleshour   date,    -- date plus hour of the figure
salesnem    number   -- quantity actually sold in that hour
)
/

-- Sales for 27-29 Dec, entered in wide format (one column per hour, h9-h18)
-- and unpivoted to one row per shop/hour. Data for 29 Dec stops after 11:00:
-- the remaining hours are predicted in Modules 4-5.
begin
insert into fw_sales
select shopid
, day + numtodsinterval(hour,'hour') saleshour
, salesnem
from (
select 'AALBORG' shopid, date '2011-12-27' day,  4 h9,  6 h10,  5 h11, 20 h12, 19 h13, 22 h14, 27 h15, 11 h16, 16 h17,  4 h18 from dual union all
select 'AALBORG'  , date '2011-12-28',  7, 17, 18, 13, 27, 28, 20, 14, 10, 19 from dual union all
select 'AALBORG'  , date '2011-12-29', 10, 14, 20, null, null, null, null, null, null, null from dual union all
select 'GLOSTRUP' , date '2011-12-27',  1,  6,  6, 14, 17, 17, 13, 15,  7,  7 from dual union all
select 'GLOSTRUP' , date '2011-12-28',  4, 14, 30, 35, 22, 21, 35, 34, 15, 25 from dual union all
select 'GLOSTRUP' , date '2011-12-29',  6, 13, 50, null, null, null, null, null, null, null from dual union all
select 'HADERSLEV', date '2011-12-27',  4,  7, 13, 15, 17, 13, 18, 19, 10,  3 from dual union all
select 'HADERSLEV', date '2011-12-28',  8,  5, 14, 18, 20, 18, 15, 24, 12,  1 from dual union all
select 'HADERSLEV', date '2011-12-29',  1, 19, 33, null, null, null, null, null, null, null from dual
) s1
unpivot exclude nulls (    -- EXCLUDE NULLS drops the hours with no recorded sales
salesnem for hour in (
h9  as  9,
h10 as 10,
h11 as 11,
h12 as 12,
h13 as 13,
h14 as 14,
h15 as 15,
h16 as 16,
h17 as 17,
h18 as 18
)
);
commit;
end;
/

-- Warehouse stock used by the picking examples (Modules 6-10).
create table inventory (
item  varchar2(10)   -- identification of the item
, loc   varchar2(10)   -- location, format 'W-A-PP': warehouse, aisle, position (parsed with SUBSTR in Module 9)
, qty   number         -- quantity present at that location
, purch date           -- date that quantity was purchased
);

insert into inventory values('Ale' , '1-A-20', 18, DATE '2014-02-01');
insert into inventory values('Ale' , '1-A-31', 12, DATE '2014-02-05');
insert into inventory values('Ale' , '1-C-05', 18, DATE '2014-02-03');
insert into inventory values('Ale' , '2-A-02', 24, DATE '2014-02-02');
insert into inventory values('Ale' , '2-D-07',  9, DATE '2014-02-04');
insert into inventory values('Bock', '1-A-02', 18, DATE '2014-02-06');
insert into inventory values('Bock', '1-B-11',  4, DATE '2014-02-05');
insert into inventory values('Bock', '1-C-04', 12, DATE '2014-02-03');
insert into inventory values('Bock', '1-B-15',  2, DATE '2014-02-02');
insert into inventory values('Bock', '2-D-23',  1, DATE '2014-02-04');
commit;

-- Order lines to be fulfilled from INVENTORY (Modules 6-10).
create table orderline (
ordno number         -- id-number of the order
, item  varchar2(10)   -- identification of the item
, qty   number         -- quantity ordered
);

insert into orderline values (42, 'Ale' , 24);
insert into orderline values (42, 'Bock', 18);
commit;

create table running_log (
run_date       date not null,
time_in_s      int  not null,    -- duration of the run in seconds
distance_in_km int  not null     -- distance covered in kilometres
)
;

-- A sample data set of running training logs.
-- The gaps (4-5, 8-9, 11-12 April) split the dates into the
-- consecutive-day groups analysed in Modules 11-12.
begin
insert into running_log values (date'2020-04-01', 310, 1);
insert into running_log values (date'2020-04-02', 1600, 5);
insert into running_log values (date'2020-04-03', 3580, 11);
insert into running_log values (date'2020-04-06', 1550, 5);
insert into running_log values (date'2020-04-07', 300, 1);
insert into running_log values (date'2020-04-10', 280, 1);
insert into running_log values (date'2020-04-13', 1530, 5);
insert into running_log values (date'2020-04-14', 295, 1);
insert into running_log values (date'2020-04-15', 292, 1);
commit;
end;
/

-- Local copy of the classic SCOTT.EMP demo table, used in Modules 13-14.
create table emp as
select * from scott.emp;``````
• Module 1

## Predicting Stock Shortages

To start we'll build a prediction engine, estimating when shops will run out of fireworks to sell. This will use data from tables storing the shops, the sales received, and the daily and hourly sales predictions:

``````select * from fw_store;      -- shops and their starting stock (containers)
select * from fw_sales;      -- actual hourly sales recorded so far
select * from fw_daybudget;  -- predicted daily sales per shop
select * from fw_hourbudget; -- expected share of a day's sales per opening hour
``````

• Module 2

## Calculating Running Totals

By adding the OVER clause to SUM, you can calculate running totals. This has three clauses:

• Partition by - split the data set up into separate groups
• Order by - sort the data, defining the order totals are calculated
• Window clause - which rows to include in the total

This returns the cumulative sales for each shop by date:

``````alter session set nls_date_format='YYYY-MM-DD HH24:MI:SS';

-- Cumulative sales per shop in time order.
select sales.shopid
, sales.salesnem
, sales.saleshour
, sum(sales.salesnem) over (
partition by sales.shopid                          -- restart the total for each shop
order by sales.saleshour                           -- accumulate in time order
rows between unbounded preceding and current row   -- this row plus all earlier rows
) salesnemacc
from fw_sales sales
order by sales.shopid, sales.saleshour;
``````

The clauses in this SUM work as follows:

• `partition by sales.shopid`
calculates the running total for each shop separately
• `order by sales.saleshour`
sorts the rows by hour, so this gives the cumulative sales by date
• `rows between unbounded preceding and current row`
sums the sales for this row and all previous rows
• Module 3

## Creating Hourly Sales Predictions

The budget figures are daily. To combine these with hourly sales, we need to convert them to hourly figures.

This query does this by cross joining the day and hour budget tables. Then multiplying the daily budget by the hourly percentage to give the expected sales for that hour:

``````alter session set nls_date_format='YYYY-MM-DD HH24:MI:SS';

-- shop: starting stock per shop; each container holds 250 units
with shop as (
select s.shopid
, s.containers * 250 startnem
from fw_store s
-- budget: spread each daily budget across the opening hours
), budget as (
select db.shopid
, db.budgetdate + numtodsinterval(hb.hour,'hour') budgethour    -- date + hour => hourly timestamp
, db.budgetnem * hb.percent / 100 budgetnem                     -- daily budget * hourly share
from   fw_daybudget db
cross  join fw_hourbudget hb    -- every budget day combined with every hour
)
select budget.shopid
, shop.startnem
, budget.budgethour hour
, budget.budgetnem qtynem
from   shop
join   budget
on     budget.shopid = shop.shopid;
``````

• Module 4

## Finding When Predicted Sales Exceed Stock Level

We can now combine the hourly actual and predicted sales figures. We want to include actual figures up to the time of the last recorded sale. After this, the query should use the expected figures.

This query returns the real or projected sales figures. Then computes the running total of sales of this combined figure:

``````alter session set nls_date_format='YYYY-MM-DD HH24:MI:SS';

with shop as (
select s.shopid
, s.containers * 250 startnem
from   fw_store s
), budget as (
select db.shopid
, db.budgetdate + numtodsinterval(hb.hour,'hour') budgethour
, db.budgetnem * hb.percent / 100 budgetnem
from   fw_daybudget db
cross  join fw_hourbudget hb
-- nem: actual sales up to the cutoff, budget figures after it
), nem as (
select budget.shopid
, shop.startnem
, budget.budgethour hour
, case
when budget.budgethour < to_date('2011-12-29 12:00:00','YYYY-MM-DD HH24:MI:SS')    -- hard-coded "now": just after the last recorded sale
then 'S'    -- actual Sales figure
else 'B'    -- Budget (predicted) figure
end salesbudget
, case
when budget.budgethour < to_date('2011-12-29 12:00:00','YYYY-MM-DD HH24:MI:SS')
then nvl(sales.salesnem,0)    -- hours with no sales row count as zero sold
else budget.budgetnem
end qtynem
from   shop
join   budget
on     budget.shopid = shop.shopid
left outer join fw_sales sales
on     sales.shopid = budget.shopid
and    sales.saleshour = budget.budgethour
)
select nem.shopid
, nem.hour
, nem.salesbudget
, nem.qtynem
, sum(nem.qtynem) over (
partition by nem.shopid
order by nem.hour
rows between unbounded preceding and current row
) sumnem
, greatest(nem.startnem - nvl(    -- GREATEST floors remaining stock at zero
sum(nem.qtynem) over (
partition by nem.shopid
order by nem.hour
rows between unbounded preceding and 1 preceding    -- previous rows only: stock at the START of the hour
)
,0),0) stocknem
from nem
order by shopid, hour;``````

It also predicts the remaining stock at each time by subtracting the running total of hourly sales (real or predicted) from the starting stock. Notice that this has this window clause:

`rows between unbounded preceding and 1 preceding`

This means include all the previous rows and exclude the current row. This is because we want to return the expected stock at the start of the hour. Including the current row returns the expected level at the end of the hour.

• Module 5

## Predicting Exact Time of Zero Stock

The previous query returned every hour and the expected stock level. To find the time stock is expected to run out, find the last date where the remaining stock is greater than zero:

``````alter session set nls_date_format='YYYY-MM-DD HH24:MI:SS';

with shop as (
select s.shopid
, s.containers * 250 startnem
from fw_store s
), budget as (
select db.shopid
, db.budgetdate + numtodsinterval(hb.hour,'hour') budgethour
, db.budgetnem * hb.percent / 100 budgetnem
from fw_daybudget db
cross join fw_hourbudget hb
-- nem: actual sales before the cutoff, budget figures after (as in Module 4)
), nem as (
select budget.shopid
, shop.startnem
, budget.budgethour hour
, case
when budget.budgethour < to_date('2011-12-29 12:00:00','YYYY-MM-DD HH24:MI:SS')
then 'S'
else 'B'
end salesbudget
, case
when budget.budgethour < to_date('2011-12-29 12:00:00','YYYY-MM-DD HH24:MI:SS')
then nvl(sales.salesnem,0)
else budget.budgetnem
end qtynem
from   shop
join   budget
on     budget.shopid = shop.shopid
left outer join fw_sales sales
on     sales.shopid = budget.shopid
and    sales.saleshour = budget.budgethour
)
select shopid
, max(hour)    -- last hour that still has stock at its start
+ numtodsinterval(
max(stocknem) keep (dense_rank last order by hour)    -- stock left at the start of that hour
/ max(qtynem) keep (dense_rank last order by hour)    -- expected sales during that hour
,'hour'
) zerohour    -- stock/sales ratio interpolates the run-out time within the hour
from (
select nem.shopid
, nem.hour
, nem.salesbudget
, nem.qtynem
, sum(nem.qtynem) over (
partition by nem.shopid
order by nem.hour
rows between unbounded preceding and current row
) sumnem
, greatest(nem.startnem - nvl(
sum(nem.qtynem) over (
partition by nem.shopid
order by nem.hour
rows between unbounded preceding and 1 preceding
)
,0),0) stocknem
from nem
)
where stocknem > 0    -- keep only hours that start with stock remaining
group by shopid
order by shopid;``````

To estimate the exact time stock will run out, we can take the ratio of expected sales to remaining stock for the hour it's due to run out. The query does this with these functions:

```     , max(hour)
+ numtodsinterval(
max(stocknem) keep (dense_rank last order by hour)
/ max(qtynem) keep (dense_rank last order by hour)
,'hour'
) zerohour```

This uses the KEEP clause to return the value for stock and quantity for the maximum HOUR. This is necessary because STOCKNEM & QTYNEM are not in the GROUP BY or in an aggregate function.

This technique of calculating the previous running total up to some limit has many other applications. These include:

• Stock picking algorithms
• Calculating SLA breach times for support tickets

We'll look at how you can use this to implement stock picking algorithms next.

• Module 6

## Stock Picking Routines

Next we'll look at how to use SQL to find which stock location to get inventory from to fulfil orders. This will use these tables:

``````select * from orderline;    -- order 42: quantities of Ale and Bock ordered
-- stock per item per warehouse location
select * from inventory;``````

The algorithm needs to list all the locations stock pickers need to choose from. There must be enough locations that the sum of the quantities in stock reaches the ordered quantity of that product.

• Module 7

## Get the Cumulative Quantity of Stock Picked

The algorithm needs to keep selecting locations until the total quantity picked is greater than the ordered quantity. As with the previous problem, this is possible with a running SUM.

This query gets the cumulative selected quantity for the current row and the previous quantity:

``````select o.item
, o.qty ord_qty
, i.loc
, i.purch
, i.qty loc_qty
, sum(i.qty) over (
partition by i.item
order by i.purch, i.loc    -- oldest purchase first (FIFO), location as tie-break
rows between unbounded preceding and current row
) sum_qty
, sum(i.qty) over (
partition by i.item
order by i.purch, i.loc
rows between unbounded preceding and 1 preceding    -- excludes current row: NULL on the first row
) sum_prv_qty
from   orderline o
join   inventory i
on     i.item  = o.item
where  o.ordno = 42
order by o.item, i.purch, i.loc;``````

To filter this to those locations needed to fulfil the order, we need all the rows where the previous picked quantity is less than the ordered quantity.

• Module 8

## Try Different Stock Picking Algorithms

Like the previous problem, the SQL needs to keep adding rows to the result until the running total for the previous row is greater than the ordered quantity.

This starts the SQL to find all the needed locations. You can implement different stock picking algorithms by changing the ORDER BY for the running total. Experiment by replacing /* TODO */ with different columns to see what effect this has on the locations chosen:

``````select s.*
, least(s.loc_qty, s.ord_qty - s.sum_prv_qty) pick_qty    -- take the whole location, or just enough to finish the order
from (
select o.item
, o.qty ord_qty
, i.loc
, i.purch
, i.qty loc_qty
, nvl(sum(i.qty) over (
partition by i.item
order by /* TODO */
rows between unbounded preceding and 1 preceding
),0) sum_prv_qty    -- NVL: first row has no previous rows, treat as zero picked
from  orderline o
join  inventory i
on    i.item  = o.item
where o.ordno = 42
) s
where s.sum_prv_qty < s.ord_qty    -- keep locations until the previous total covers the order
order by s.item, s.purch, s.loc;``````

• Module 9

## Row Numbering Methods

Whichever algorithm you decide to use to select stock, it may choose a poor route for the stock picker to walk around the warehouse. This can lead to them walking back down an aisle, when it would be better to continue up to the top of the aisle. Then walk back down the next one.

One way to do this is to number the aisles needed. Then walk up the odds and back down the evens. This means locations in the same aisle need the same aisle number. We want to assign these numbers!

Oracle Database has three row numbering functions:

• Rank - An Olympic ranking system. Rows with the same sort key have the same rank. After ties there is a gap in the ranks: the numbering resumes from the row's overall position in the results.
• Dense_Rank - Like RANK, this sets the rank to be the same for rows with the same sort key. But this has no gaps in the sequence
• Row_Number - This gives unique consecutive values

This compares the different ranking functions:

``````with inv_locations as (
-- Split LOC ('W-A-PP') into its warehouse, aisle and position parts.
select s.loc
, to_number(substr(s.loc,1,1)) warehouse
, substr(s.loc,3,1) aisle
, to_number(substr(s.loc,5,2)) position
from   inventory s
)
select loc
, warehouse
, aisle
, row_number () over (
order by warehouse, aisle
) rn    -- unique, consecutive
, rank () over (
order by warehouse, aisle
) rk    -- ties share a rank; gaps after ties
, dense_rank () over (
order by warehouse, aisle
) dr    -- ties share a rank; no gaps
from   inv_locations;``````

• Module 10

## Change Stock Picking Route

To improve the routing algorithm, we want to give locations row numbers. Locations on the same aisle must have the same rank. We can then alternate the route up and down the aisles by sorting:

• By ascending position for odd aisles
• By descending position for even aisles

To do this, locations in the same aisle must have the same rank, and there must be no gaps in the ranks. Thus DENSE_RANK is the correct function to use here. This sorts by warehouse number, then aisle:

```      , dense_rank() over (
order by to_number(substr(s.loc,1,1))     -- warehouse
, substr(s.loc,3,1)                -- aisle
) aisle_no```

To alternate ascending/descending sorts, take this rank modulus two. Return the position when it's one and negate the position when it's zero:

```      , case
when mod(s2.aisle_no,2) = 1 then s2.position
else                            -s2.position
end;```

The complete query for this is:

``````select s2.warehouse, s2.aisle, s2.aisle_no, s2.position
, s2.loc, s2.item, s2.pick_qty
from (
select to_number(substr(s.loc,1,1)) warehouse
, substr(s.loc,3,1) aisle
, dense_rank() over (
order by to_number(substr(s.loc,1,1))     -- warehouse
, substr(s.loc,3,1)                -- aisle
) aisle_no
, to_number(substr(s.loc,5,2)) position
, s.loc, s.item
, least(s.loc_qty, s.ord_qty - s.sum_prv_qty) pick_qty
from (
select o.item, o.qty ord_qty, i.loc, i.purch, i.qty loc_qty
, nvl(sum(i.qty) over (
partition by i.item
order by i.qty, i.loc    -- NOTE(review): picks smallest quantities first; Modules 7-8 ordered by purch date - confirm this algorithm choice is intended
rows between unbounded preceding and 1 preceding
),0) sum_prv_qty
from   orderline o
join   inventory i
on     i.item  = o.item
where  o.ordno = 42
) s
where s.sum_prv_qty < s.ord_qty
) s2
order by s2.warehouse
, s2.aisle_no
-- walk up odd aisles (ascending position) and down even ones (descending)
, case
when mod(s2.aisle_no,2) = 1 then s2.position
else                            -s2.position
end;``````

• Module 11

## Finding Consecutive Rows - Tabibitosan

For the third problem we're searching for consecutive dates in this running log:

``select * from running_log;``

The goal is to split these rows into groups of consecutive dates. For each group, return the start date and number of days in it.

There's a trick you can use to do this:

1. Assign unique, consecutive numbers sorted by date to each row
2. Subtract this row number from the date

After applying this method, consecutive dates will have the same value:

``````with grps as (
select run_date
, row_number () over ( order by run_date ) rn
, run_date - row_number () over ( order by run_date ) grp    -- date minus row number: constant within a run of consecutive dates
from   running_log
)
select *
from   grps
order  by run_date;``````

You can then summarise these data by grouping by the expression above and returning min, max, and counts to find start, end, and numbers of rows:

``````with grps as (
select run_date, row_number () over ( order by run_date ) rn ,
run_date - row_number () over ( order by run_date ) grp    -- Tabibitosan grouping key
from   running_log
)
select min ( run_date ) first_run, count(*) runs    -- start date and length of each streak
from   grps
group  by grp
order  by first_run;``````

This technique is referred to as the Tabibitosan method.

• Module 12

## Finding Consecutive Rows - Pattern Matching

Added in Oracle Database 12c, the row pattern matching clause, MATCH_RECOGNIZE, offers another way to solve this problem.

To do this, you need to define pattern variables: criteria for rows to meet. Then create a regular expression using these variables.

To find consecutive dates, you need to look for rows where the current date equals the previous date plus one. This pattern variable does this:

`consecutive as run_date = prev ( run_date ) + 1`

To search for a series of consecutive rows, use this pattern:

`pattern ( init consecutive* ) `

This matches one instance of INIT followed by any number of CONSECUTIVE.

But what's this INIT variable? It has no definition!

Undefined variables are "always true". This matches any row. This enables the pattern to match the first row in the data set. Without this CONSECUTIVE will always be false, because the previous RUN_DATE will always be null.

This query returns the first date and number of rows in each group:

``````select *
from   running_log
match_recognize (
order by run_date
measures
first ( run_date ) as start_date,
count (*) as days
pattern ( init consecutive* )    -- INIT is undefined, so always true: it matches the group's first row
define
consecutive as run_date = prev ( run_date ) + 1    -- row extends the streak by exactly one day
);``````

By default MATCH_RECOGNIZE returns one row per group. To make it easier to see what's going on, this query adds the clause:

`all rows per match`

This returns all the matched rows. In the MEASURES clause it also adds the CLASSIFIER function. This returns the name of the pattern variable this row matches:

``````select *
from   running_log
match_recognize (
order by run_date
measures
classifier () as var,    -- which pattern variable (INIT or CONSECUTIVE) this row matched
first ( run_date ) as start_date,
count (*) as days
all rows per match    -- show every matched row, not one summary row per group
pattern ( init consecutive* )
define
consecutive as run_date = prev ( run_date ) + 1
);``````

• Module 13

## Counting Number of Child Nodes in a Tree

To finish, we'll build an organization tree using the classic EMP table:

``````select empno
, lpad(' ', (level-1)*2) || ename as ename    -- indent two spaces per level to visualise the tree
from   emp
start  with mgr is null    -- anchor at the root; without this every row starts its own tree, duplicating nodes
connect by mgr = prior empno
order siblings by empno;``````

We want to augment this by adding the total number of reports each person has. I.e. count the number of nodes in the tree below this one.

You can do this with the following query:

``````select empno
, lpad(' ', (level-1)*2) || ename as ename
-- Scalar subquery: walk the tree below the current employee and count the
-- nodes. It must be anchored (START WITH) at the current row's direct
-- reports; without the correlation the count is identical for every row.
, (
select count(*)
from emp sub
start with sub.mgr = emp.empno
connect by sub.mgr = prior sub.empno
) subs
from   emp
start  with mgr is null    -- build a single tree from the root
connect by mgr = prior empno
order siblings by empno;``````

The subquery calculates the hierarchy for every row in the table. So it queries EMP once for each row in the table. This leads to a huge amount of extra work. You can see this by getting its execution plan:

``````select /*+ gather_plan_statistics */empno
, lpad(' ', (lvl-1)*2) || ename as ename
-- Correlated scalar subquery: counts the nodes below the current employee.
-- Anchored at the current row's direct reports so each row gets its own
-- count - and so the subquery re-scans EMP once per row (see plan below).
, (
select count(*)
from emp sub
start with sub.mgr = emp.empno
connect by sub.mgr = prior sub.empno
) subs
from   emp
start  with mgr is null
connect by mgr = prior empno
order siblings by empno;

select *
from   table(dbms_xplan.display_cursor(:LIVESQL_LAST_SQL_ID, format => 'ROWSTATS LAST'));``````

Note the FULL TABLE SCAN at line three happens 14 times. Reading a total of 196 rows. As you add more rows to EMP, this query will scale terribly.

• Module 14

## Counting Child Nodes - Pattern Matching

You can overcome the performance issues for the previous query with this algorithm:

• Create the hierarchy returning the rows using depth first search.
• Add a row number to the results showing which order this returns rows
• Walk through the tree in this order
• For each row, count the number of rows after it which are at a lower depth (the LEVEL is greater)

This uses the fact that when using depth-first search, all the children of a node will be at a lower depth. The next node at the same depth as, or higher than, the current node is not a child.

You can implement this in MATCH_RECOGNIZE with these clauses:

```      pattern ( strt higher* )
define
higher as higher.lvl > strt.lvl```

Like searching for consecutive rows, this starts with an always true variable. Then looks for zero or more rows which are at a greater depth.

A key difference is this clause:

`   after match skip to next row`

After matching the pattern for the first row, this instructs the database to repeat the process for the second row in the data set. Then the third, fourth, etc.

This contrasts with the default behaviour for MATCH_RECOGNIZE: after completing a pattern, continue the search from the row after the last matched row. Because all rows are children of the root, the default would consume every row in the first match, then have nothing left to match! So it would only return the child count for KING!

All together this gives:

``````with hierarchy as (
-- Depth-first hierarchy with ROWNUM recording the traversal order.
select lvl, empno, ename, rownum as rn
from (
select level as lvl, empno, ename
from   emp
start  with mgr is null
connect by mgr = prior empno
order siblings by empno
)
)
select /*+ gather_plan_statistics */empno
, lpad(' ', (lvl-1)*2) || ename as ename
, subs
from hierarchy
match_recognize (
order by rn
measures
strt.rn as rn
, strt.lvl as lvl
, strt.empno as empno
, strt.ename as ename
, count(higher.lvl) as subs    -- rows deeper than STRT = total reports
one row per match
-- Restart matching at the row after STRT. The default (skip past the last
-- matched row) would consume every row in the first match and report only
-- KING's count - see the discussion above.
after match skip to next row
pattern ( strt higher* )    -- STRT (always true) then zero or more deeper rows
define
higher as higher.lvl > strt.lvl
)
order by rn;

select *
from   table(dbms_xplan.display_cursor(:LIVESQL_LAST_SQL_ID, format => 'ROWSTATS LAST'));``````

Notice that now the pattern only reads EMP once, for a total of 14 rows read. This scales significantly better than using a subquery!