PostgreSQL black technology - recursive dichotomy for Chinese character segmentation
This article is PostgreSQL fuzzy query In the improved version, the number and English words are extracted separately without segmentation.
1. Create a punctuation function in the clear text
At present, the punctuation I can think of is the following, welcome to add.
--Do not erase spaces, underscores, minus signs
drop function if exists clear_punctuation(text);
create or replace function clear_punctuation(text)
returns text
as $$
select regexp_replace($1,
'[\~|\`|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\+|\=|\||\\|\[|\]|\{|\}|\;|\:|\"|\''|\,|\<|\.|\>|\/|\?|\: |\. |\;|\,|\: |\"|\"|\(|\)|\,|\?|\<|\>]'
,'','g');
$$ language sql strict immutable;
2. Black technology - segmentation of Chinese characters by recursive dichotomy
Every two characters of Chinese characters are treated as one word. For example, after "call function" is segmented, the result is three words, which are "call", "function" and "function".
drop function if exists dichotomy_split_sql(text);
create or replace function dichotomy_split_sql(text)
returns table(val varchar(2))
as $$
with recursive cte(pos,val) as(
values( 2,substring($1,1,2))
union all
select (pos+1) as pos,(select substring($1,pos,2) ) as val
from cte as rec(pos,val) where char_length(val) = 2
)select val from cte where char_length(val) = 2;
$$ language sql;
3. Create array and convert to row function
drop function if exists array_to_rows(text[]);
create or replace function array_to_rows(text[])
returns table(val text)
as $$
select val from unnest( $1 ) AS val;
$$ language sql;
4. Extract numbers, English words and segmentation results of Chinese characters
drop function if exists get_matche_results(text);
create or replace function get_matche_results(text)
returns table(val text)
as $$
with num_en as(
--Get numbers, English words
select c from regexp_matches($1, '-?\d+|[a-zA-Z_]+','g') as c
),txt as(
--Extract all characters except numbers, English words, and spaces
select regexp_replace(d,'[\ ]','','g') as d from regexp_split_to_table($1, '-?\d+|[a-zA-Z_\ ]+') as d
),arr as(
select * from num_en
union all
select (select array_agg(x) from dichotomy_split_sql(d) as x) from txt where d<>''
) select array_to_rows(val) as h from arr as f(val) group by h order by h;
$$ language sql;
--usage method
select * from get_matche_results(clear_punctuation('Call function SRF_PERCALL_SETUP() and work For use FuncCallContext Make proper settings and clear-2239768 Any previous wheel 9987 returns the returned data under the inner body.'));
5. Convert to tsvector
drop function if exists dichotomy_split_tsv(text);
create or replace function dichotomy_split_tsv(text)
returns tsvector
as $$
select array_to_tsvector(array_agg(val)) from get_matche_results($1) as val;
$$ language sql;
--usage method
select * from dichotomy_split_tsv(clear_punctuation('Call function SRF_PERCALL_SETUP() and work For use FuncCallContext Make proper settings and clear-2239768 Any previous wheel 9987 returns the returned data under the inner body.'));
6. Convert to tsquery
drop function if exists dichotomy_split_tsq(text,boolean);
create or replace function dichotomy_split_tsq(text,boolean default true)
returns tsquery
as $$
select string_agg(val, (case when $2 then '&' else '|' end ) )::tsquery from get_matche_results($1) as val;
$$ language sql;
--usage method
select * from dichotomy_split_tsq(clear_punctuation('Call function SRF_PERCALL_SETUP() and work For use FuncCallContext Make proper settings and clear-2239768 Any previous wheel 9987 returns the returned data under the inner body.'));