create or replace package body gestalt_pkg
IS
l_min_threshold SIMPLE_FLOAT := 0;
FUNCTION lcs(
in_str1 IN VARCHAR2,
in_str2 IN VARCHAR2
) RETURN VARCHAR2
IS
PRAGMA UDF;
TYPE numbers_array IS TABLE OF PLS_INTEGER INDEX BY PLS_INTEGER;
TYPE Array2D IS TABLE OF numbers_array INDEX BY PLS_INTEGER;
c_l1 SIMPLE_INTEGER := 0;
c_l2 SIMPLE_INTEGER := 0;
p_matrix Array2D;
p_c1 NCHAR(1);
p_c2 NCHAR(1);
endingIndex SIMPLE_INTEGER := 0;
maxlen SIMPLE_INTEGER := 0; --lcs
l_check SIMPLE_INTEGER := 0; --check if there are any matching characters between the 2 strings
BEGIN
IF in_str1 IS NULL OR in_str2 IS NULL THEN
RETURN NULL;
ELSIF c_l1 > c_l2 THEN
RETURN lcs( in_str2, in_str1 );
END IF;
--check if there are any matching characters; if none found, RETURN NULL
--@todo: might consider removing the whitespaces before making the comparison
SELECT
COUNT(1)
INTO
l_check
FROM
(
SELECT in_str1 initial_string, TRANSLATE ( in_str1, in_str2, '#' ) output_string FROM dual
)
WHERE
initial_String = output_string
;
IF l_check = 1
THEN
RETURN '';
END IF;
c_l1 := LENGTH(in_str1);
c_l2 := LENGTH(in_str2);
--initialize all cells of lookup table to 0
/*
FOR i IN 1..c_l1
LOOP
FOR j IN 1..c_l2
LOOP
p_matrix(i)(j) := 0;
END LOOP;
END LOOP;
*/
FOR i IN 1..c_l1
LOOP
--Check if the length of current position + length of remaining string <= max length found, exit
p_c1 := SUBSTR( in_str1, i, 1 );
FOR j IN 1..c_l2
LOOP
p_c2 := SUBSTR( in_str2, j, 1 );
--if current character of X and Y matches
IF p_c1 = p_c2
THEN
if p_matrix.exists ( i-1 ) and p_matrix ( i-1 ).exists ( j-1 )
then
p_matrix ( i )( j ) := COALESCE(p_matrix ( i-1 )( j-1 ), 0) + 1;
else
p_matrix ( i )( j ) := 1;
end if;
IF p_matrix(i)(j) > maxlen
THEN
maxlen := p_matrix(i)(j);
endingIndex := i;
END IF;
--if there are fewer chars remaining in string1 than maxlen already found, exit to save some cpu cycles
IF (c_l1 - i + 1) < maxlen
THEN
RETURN SUBSTR(in_str1, endingIndex - maxlen + 1, maxlen);
END IF;
END IF;
END LOOP;
END LOOP;
RETURN SUBSTR(in_str1, endingIndex - maxlen + 1, maxlen);
END lcs;
FUNCTION cs(
in_str1 IN VARCHAR2,
in_str2 IN VARCHAR2
) RETURN FLOAT
AS
PRAGMA UDF;
v_lcs VARCHAR2(123);
ll VARCHAR2(123);
rl VARCHAR2(123);
lr VARCHAR2(123);
rr VARCHAR2(123);
BEGIN
--compute longest common substring
v_lcs := lcs(in_str1, in_str2);
IF v_lcs IS NULL
THEN
RETURN 0;
END IF;
LOOP
ll := SUBSTR( in_str1, 1, INSTR ( in_str1, v_lcs ) - 1 );
lr := SUBSTR( in_str1, INSTR ( in_str1, v_lcs ) + LENGTH ( v_lcs ) );
rl := SUBSTR( in_str2, 1, INSTR ( in_str2, v_lcs ) - 1 );
rr := SUBSTR( in_str2, INSTR ( in_str2, v_lcs ) + LENGTH ( v_lcs ) );
RETURN 2 * (COALESCE(length(v_lcs), 0) + COALESCE(length(lcs(ll, rl)), 0) + COALESCE(length(lcs(lr, rr)), 0)) / ( LENGTH(in_str1) + LENGTH(in_str2) );
END LOOP;
RETURN 0;
END cs;
FUNCTION lcs_sql
(
l IN VARCHAR2
, r IN VARCHAR2
)
RETURN SIMPLE_FLOAT
IS
PRAGMA UDF;
ll VARCHAR2(123);
rl VARCHAR2(123);
lr VARCHAR2(123);
rr VARCHAR2(123);
BEGIN
FOR rec IN
(
SELECT
INSTR(l, SUBSTR(r,r_strt,len)) l_strt
, r_strt
, len
FROM
dual
, LATERAL(SELECT LENGTH(r) + 1 - LEVEL len from dual CONNECT BY LENGTH(r) + 1 - LEVEL > 0) s
, LATERAL(SELECT LEVEL r_strt FROM dual CONNECT BY LEVEL + s.len - 1 <= LENGTH(r)) e
WHERE
INSTR(l, SUBSTR(r,e.r_strt,len)) > 0
AND ROWNUM <= 1
)
LOOP
ll := SUBSTR(l, 1, rec.l_strt - 1);
rl := SUBSTR(r, 1, rec.r_strt - 1);
lr := SUBSTR(l, rec.l_strt + rec.len);
rr := SUBSTR(r, rec.r_strt + rec.len);
RETURN rec.len + lcs_sql(ll, rl) + lcs_sql(lr, rr);
END LOOP;
RETURN 0;
END lcs_sql;
FUNCTION ratcliff_obershelp_alg(
in_str1 in varchar2,
in_str2 in varchar2
) RETURN SIMPLE_FLOAT
IS
PRAGMA UDF;
BEGIN
RETURN 2 * lcs_sql(in_str1, in_str2) / ( LENGTH(in_str1) + LENGTH(in_str2) );
END ratcliff_obershelp_alg;
PROCEDURE test_similarity
(
p_string IN VARCHAR2
)
IS
TYPE t_results is RECORD (obj_id INTEGER, object_name VARCHAR2(123));
TYPE results_tab is TABLE OF t_results INDEX BY PLS_INTEGER;
l_results results_tab;
TYPE t_results2 is RECORD (obj_id INTEGER, object_name VARCHAR2(123), score FLOAT);
TYPE results2_tab is TABLE OF t_results2 INDEX BY PLS_INTEGER;
l_results2 results2_tab := results2_tab();
CURSOR cref
IS
SELECT obj_id, object_name
FROM
(
select object_id+rownum AS obj_id, object_name from all_objects
, ( select 1 from dual connect by level <= 2 ) t
)
;
i SIMPLE_INTEGER := 1;
l_score SIMPLE_FLOAT := 0;
l_time NUMBER;
BEGIN
l_time := dbms_utility.get_time;
OPEN cref;
FETCH cref BULK COLLECT INTO l_results;
CLOSE cref;
FOR indx in 1..l_results.COUNT
LOOP
--SQL version
--l_score := ratcliff_obershelp_alg(PDenomination, l_results(indx).deno);
--PL/SQL version
l_score := cs(p_string, l_results(indx).object_name);
IF l_score >= l_min_threshold
THEN
l_results2(i).obj_id := l_results(indx).obj_id;
l_results2(i).object_name := l_results(indx).object_name;
l_results2(i).score := l_score;
i := i + 1;
END IF;
END LOOP;
dbms_output.put_line('[' || l_results2.COUNT || '] strings selected out of pool of [' || l_results.COUNT || '] in ' || (dbms_utility.get_time - l_time)/100 || ' seconds.');
END test_similarity;
BEGIN
l_min_threshold:= 0.65;
END gestalt_pkg;