Last active
April 1, 2020 13:43
-
-
Save fjavier/586c713943d76a023a70 to your computer and use it in GitHub Desktop.
Revisions
-
fjavier revised this gist
Dec 23, 2014 . 1 changed file with 40 additions and 21 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -34,6 +34,9 @@ $BODY$ v_is_prefix boolean; -- bandera que controla el numero de prefijos v_count_prefix integer; -- almacena el numero de prefijos BEGIN v_is_prefix = true; v_count_prefix = 0; v_string_match2_disorderly = ''; @@ -55,16 +58,19 @@ $BODY$ v_arreglo_cadena1 = string_to_array(v_format_cadena1,null); v_arreglo_cadena2 = string_to_array(v_format_cadena2, null); <<outerloop>> FOREACH v_iterator_cadena2 IN ARRAY v_arreglo_cadena2 LOOP --Reiniciamos el contador de la cadena1 v_position_cadena1 = 1; <<innerloop>> --Recorremos la cadena 1 en busca de coincidencias. FOREACH v_iterator_cadena1 IN ARRAY v_arreglo_cadena1 LOOP --Verificamos si el caracter encontrado es nulo hacemos un continue IF v_iterator_cadena1 IS NOT NULL THEN -- Verificamos si el caracter de la cadena 2 es igual al caracter de la cadena 1 y si se encuentran en el rango de coincidencias IF v_iterator_cadena2 = v_iterator_cadena1 AND (ABS(v_position_cadena1 - v_position_cadena2) BETWEEN 0 AND v_match_range) @@ -100,35 +106,43 @@ $BODY$ v_position_cadena2 = v_position_cadena2 + 1; END LOOP; /* RAISE NOTICE 'itera match, %',array(select unnest(v_string_match1) order by 1);*/ IF v_count_prefix IS NULL then v_count_prefix=0; END IF; --Se ordenan las coincidencias de la primera cadena y se ordenan mediante su real posicion en el arreglo IF array(select unnest(v_string_match1) order by 1) is not null then FOREACH v_itera_match IN ARRAY (select array(select unnest(v_string_match1) order by 1)) LOOP v_string_match1_arranged = v_string_match1_arranged || substring(v_itera_match from '.$'); END LOOP ; end if; /*RAISE NOTICE 'itera match2, %',v_string_match2;*/ IF v_string_match2 IS NOT NULL THEN --Se ordenan las coincidencias de la segunda cadena segun al orden encontrado en la iteracion FOREACH v_itera_match IN ARRAY v_string_match2 LOOP v_string_match2_disorderly = v_string_match2_disorderly || substring(v_itera_match from '.$'); END LOOP ; END IF; --Se valida en caso de que no exista coincidencia alguna IF v_string_match2 IS NOT NULL AND v_string_match1 IS NOT NULL THEN --Se encuentran el numero de Transposiciones. IF v_string_match1_arranged != v_string_match2_disorderly THEN FOR iterator IN 1..length(v_string_match1_arranged) LOOP IF (select array(select unnest(v_string_match1) order by 1))[iterator] != v_string_match2[iterator] THEN v_transposition = v_transposition + 1 ; END IF ; END LOOP ; END IF; --TODO: revisar si realmente require redondeo v_transposition = CASE WHEN v_transposition!=0 then floor(v_transposition / 2) ELSE v_transposition END; --Calculando jaro-distance v_jaro_distance = (1::NUMERIC/3::NUMERIC)*((v_match::NUMERIC /length(cadena1)::NUMERIC )+(v_match::NUMERIC/length(cadena2)::NUMERIC )+((v_match::NUMERIC -v_transposition::NUMERIC )::NUMERIC/v_match::NUMERIC)); --calculando jaro-winkler v_jaro_winkler = v_jaro_distance + ( v_count_prefix * 0.1 * (1 - v_jaro_distance ) ); /* RAISE NOTICE 'PREFIJOS: %' , v_count_prefix; RAISE NOTICE 'RANGO DE COINCIDENCIAS: %', v_match_range; RAISE NOTICE 'TRANSPOSICIONES: %',v_transposition; @@ -141,6 +155,11 @@ $BODY$ RAISE NOTICE 'CADENA1 FORMADA POR LAS COINCIDENCIAS: %', v_string_match1_arranged; RAISE NOTICE 'CADENA2 FORMADA POR LAS COINCIDENCIAS: %', v_string_match2_disorderly; */ --Si la coincidencia es nula entonces retornamos a 0 ELSE v_jaro_winkler=0; END IF; return round(v_jaro_winkler,2); END; -
fjavier revised this gist
Sep 23, 2014 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,6 +3,7 @@ create or replace function fn_porcentaje_similitud(cadena1 character varying, ca @author: fbriceno @descripcion: devuelve el porcentaje de similitud entre 2 cadenas, haciendo uso del algoritmo jaro-winkler @return: porcentaje de similitud entre 0 y 1. entre mas cercano al 1 mayor coincidencia. mas cercano al 0 poco coincidente @documentation: https://gist.github.com/fjavier/586c713943d76a023a70 */ returns numeric as $BODY$ -
fjavier revised this gist
Sep 23, 2014 . 1 changed file with 8 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,9 @@ create or replace function fn_porcentaje_similitud(cadena1 character varying, cadena2 character varying ) /* @author: fbriceno @descripcion: devuelve el porcentaje de similitud entre 2 cadenas, haciendo uso del algoritmo jaro-winkler @return: porcentaje de similitud entre 0 y 1. entre mas cercano al 1 mayor coincidencia. mas cercano al 0 poco coincidente */ returns numeric as $BODY$ DECLARE @@ -40,7 +45,7 @@ $BODY$ v_format_cadena1 = UPPER(replace(cadena1,' ','')); v_format_cadena2 = UPPER(replace(cadena2,' ','')); -- Se calcula el tamaño maximo entre las 2 cadenas v_max_length = case when length(v_format_cadena1)>=length(v_format_cadena2) then length(v_format_cadena1) else length(v_format_cadena2) end; @@ -127,9 +132,9 @@ $BODY$ RAISE NOTICE 'RANGO DE COINCIDENCIAS: %', v_match_range; RAISE NOTICE 'TRANSPOSICIONES: %',v_transposition; RAISE NOTICE 'PASO 1:'; RAISE NOTICE 'coincidencia: % / cadena1_tam: % = %',v_match, length(cadena1), v_match::numeric/length(cadena1)::numeric; RAISE NOTICE 'PASO 2:'; RAISE NOTICE 'coincidencia: % / cadena2_tam: % = %',v_match, length(cadena2), v_match::numeric/length(cadena2)::numeric; RAISE NOTICE 'PASO 3:'; RAISE NOTICE '(coincidencia: % - transpociciones: %) / coincidencias : % = %',v_match, v_transposition, v_match, (v_match - v_transposition)::numeric/v_match::numeric; -
fjavier revised this gist
Sep 19, 2014 . 1 changed file with 107 additions and 32 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,61 +7,136 @@ $BODY$ v_max_length integer; -- tamaño maximo de las 2 cadenas v_match_range integer; -- Distancia usada para hacer la transposicion v_transposition integer; --almacena el numero de transposiciones v_jaro_distance numeric; v_jaro_winkler numeric; v_arreglo_cadena1 char[]; -- para almacenar la cadena convertida en arreglo v_arreglo_cadena2 char[]; -- para almacenar la cadena2 convertida en arreglo v_iterator_cadena1 char; -- para uso en iteracion de la cadena 1 v_position_cadena1 integer; -- Controla las posiciones de las coincidencias en la Cadena1 v_iterator_cadena2 char; -- para uso en iteracion de la cadena2 v_position_cadena2 integer; -- Controla las posiciones de las coincidencias en la Cadena2 v_match integer; --Guardamos el numero de coincidencias encontradas v_string_match1 character varying[]; --almacena como cadena las coincidencias encontradas v_string_match2 character varying[]; --almacena como cadena las coincidencias encontradas en la cadena2 v_itera_match character varying; -- itera los arreglos almacenados como coincidencias v_string_match1_arranged character varying; -- se almacena primera cadena ordenada v_string_match2_disorderly character varying; -- se almacena la segunda cadena en el orden encontrado v_is_prefix boolean; -- bandera que controla el numero de prefijos v_count_prefix integer; -- almacena el numero de prefijos BEGIN v_is_prefix = true; v_count_prefix = 0; v_string_match2_disorderly = ''; v_string_match1_arranged = ''; v_position_cadena1 = 1; v_position_cadena2 = 1; v_match = 0; v_transposition = 0; v_format_cadena1 = UPPER(replace(cadena1,' ','')); v_format_cadena2 = UPPER(replace(cadena2,' ','')); -- Se calcula el tamaño maximo entre las 2 cadenas v_max_length = case when length(v_format_cadena1)>=length(v_format_cadena2) then length(v_format_cadena1) else length(v_format_cadena2) end; -- Se calcula le maxima distancia para decidir si el caracter coincide v_match_range = floor((v_max_length/2))-1; v_arreglo_cadena1 = string_to_array(v_format_cadena1,null); v_arreglo_cadena2 = string_to_array(v_format_cadena2, null); <<outerloop>> FOREACH v_iterator_cadena2 IN ARRAY v_arreglo_cadena2 LOOP --Reiniciamos el contador de la cadena1 v_position_cadena1 = 1; <<innerloop>> --Recorremos la cadena 1 en busca de coincidencias. FOREACH v_iterator_cadena1 IN ARRAY v_arreglo_cadena1 LOOP --Verificamos si el caracter encontrado es nulo hacemos un continue IF v_iterator_cadena1 IS NOT NULL THEN -- Verificamos si el caracter de la cadena 2 es igual al caracter de la cadena 1 y si se encuentran en el rango de coincidencias IF v_iterator_cadena2 = v_iterator_cadena1 AND (ABS(v_position_cadena1 - v_position_cadena2) BETWEEN 0 AND v_match_range) THEN --verificamos si es prefijo incrementamos el contador de prefijos IF (ABS(v_position_cadena1 - v_position_cadena2) = 0) and v_is_prefix THEN v_count_prefix = CASE WHEN v_count_prefix<4 THEN v_count_prefix+1 END; ELSE v_is_prefix = false; END IF; -- Guardamos las coincidencias encontradas select ARRAY_APPEND(v_string_match1,(v_position_cadena1||v_iterator_cadena1)::CHARACTER VARYING ) INTO v_string_match1; select ARRAY_APPEND(v_string_match2,(v_position_cadena1||v_iterator_cadena1)::CHARACTER VARYING ) INTO v_string_match2; v_arreglo_cadena1[v_position_cadena1] = NULL; v_position_cadena1 = v_position_cadena1 + 1; --Se registra la coincidencia v_match = v_match+1; --Salimos de la iteracion de la cadena 1 EXIT innerloop; ELSE --Se incrementa el contador, aunque no sea una coincidencia. v_position_cadena1 = v_position_cadena1 + 1; v_is_prefix = false; END IF; ELSE --Se incrementa el contador, aunque se haya encontrado Nulo v_position_cadena1 = v_position_cadena1 + 1; CONTINUE; END IF; END LOOP; v_position_cadena2 = v_position_cadena2 + 1; END LOOP; --Se ordenan las coincidencias de la primera cadena y se ordenan mediante su real posicion en el arreglo FOREACH v_itera_match IN ARRAY (select array(select unnest(v_string_match1) order by 1)) LOOP v_string_match1_arranged = v_string_match1_arranged || substring(v_itera_match from '.$'); END LOOP ; --Se ordenan las coincidencias de la segunda cadena segun al orden encontrado en la iteracion FOREACH v_itera_match IN ARRAY v_string_match2 LOOP v_string_match2_disorderly = v_string_match2_disorderly || substring(v_itera_match from '.$'); END LOOP ; --Se encuentran el numero de Transposiciones. IF v_string_match1_arranged != v_string_match2_disorderly THEN FOR iterator IN 1..length(v_string_match1_arranged) LOOP IF (select array(select unnest(v_string_match1) order by 1))[iterator] != v_string_match2[iterator] THEN v_transposition = v_transposition + 1 ; END IF ; END LOOP ; END IF; --TODO: revisar si realmente require redondeo v_transposition = CASE WHEN v_transposition!=0 then floor(v_transposition / 2) ELSE v_transposition END; --Calculando jaro-distance v_jaro_distance = (1::NUMERIC/3::NUMERIC)*((v_match::NUMERIC /length(cadena1)::NUMERIC )+(v_match::NUMERIC/length(cadena2)::NUMERIC )+((v_match::NUMERIC -v_transposition::NUMERIC )::NUMERIC/v_match::NUMERIC)); --calculando jaro-winkler v_jaro_winkler = v_jaro_distance + ( v_count_prefix * 0.1 * (1 - v_jaro_distance ) ); RAISE NOTICE 'PREFIJOS: %' , v_count_prefix; RAISE NOTICE 'RANGO DE COINCIDENCIAS: %', v_match_range; RAISE NOTICE 'TRANSPOSICIONES: %',v_transposition; RAISE NOTICE 'PASO 1:'; RAISE NOTICE 'coincidencia: % / cadena1_tam: % = %',v_match, length(cadena1), v_match/length(cadena1); RAISE NOTICE 'PASO 2:'; RAISE NOTICE 'coincidencia: % / cadena2_tam: % = %',v_match, length(cadena2), v_match/length(cadena2); RAISE NOTICE 'PASO 3:'; RAISE NOTICE '(coincidencia: % - transpociciones: %) / coincidencias : % = %',v_match, v_transposition, v_match, (v_match - v_transposition)::numeric/v_match::numeric; RAISE NOTICE 'CADENA1 FORMADA POR LAS COINCIDENCIAS: %', v_string_match1_arranged; RAISE NOTICE 'CADENA2 FORMADA POR LAS COINCIDENCIAS: %', v_string_match2_disorderly; return round(v_jaro_winkler,2); END; $BODY$ language plpgsql VOLATILE SECURITY DEFINER; -
fjavier created this gist
Sep 18, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,69 @@ create or replace function fn_porcentaje_similitud(cadena1 character varying, cadena2 character varying ) returns numeric as $BODY$ DECLARE v_format_cadena1 character varying; --Se formatea la cadena, se eliminan los espacios v_format_cadena2 character varying; --Se formatea la cadena, se eliminan los espacios v_max_length integer; -- tamaño maximo de las 2 cadenas v_match_range integer; -- Distancia usada para hacer la transposicion v_transposition integer; v_jaro_proximity numeric; v_arreglo_cadena1 char[]; v_arreglo_cadena2 char[]; v_iterator_cadena1 char; v_position_cadena1 integer; v_iterator_cadena2 char; v_position_cadena2 integer; v_match integer; v_transposition integer; v_last_position integer; BEGIN v_last_position = 0; v_position_cadena1 = 0; v_position_cadena2 = 0; v_format_cadena1 = UPPER(replace(cadena1,' ','')); v_format_cadena2 = UPPER(replace(cadena2,' ','')); v_max_length = case when length(v_format_cadena1)>=length(v_format_cadena2) then length(v_format_cadena1) else length(v_format_cadena2) end; v_match_range = floor((v_max_length/2))-1; v_arreglo_cadena1 = string_to_array(v_format_cadena1,null); v_arreglo_cadena2 = string_to_array(v_format_cadena2, null); <<outerloop>> FOREACH v_iterator_cadena2 IN ARRAY v_arreglo_cadena2 LOOP raise notice 'Letra cadena2: %', v_iterator_cadena2; <<innerloop>> FOREACH v_iterator_cadena1 IN ARRAY v_arreglo_cadena1 LOOP raise notice 'Letra cadena1: %', v_iterator_cadena1; if v_iterator_cadena2 = v_iterator_cadena1 then raise notice 'Encontrado: %',v_iterator_cadena1; v_position_cadena1 = v_position_cadena1 + 1; exit innerloop; else continue; end if ; v_position_cadena1 = v_position_cadena1 + 1; END LOOP; raise notice 'sali del primer looop %',v_position_cadena2; v_position_cadena2 = v_position_cadena2 + 1; END LOOP; return v_match_range; END; $BODY$ language plpgsql VOLATILE SECURITY DEFINER; alter function fn_porcentaje_similitud(character varying, character varying) owner to sa;