Matheus de Oliveira
<matioli.matheus@gmail.com>
2019-08-03
Apresentação dinâmica:
http://matheusoliveira.s3-website-us-east-1.amazonaws.com/presentations/pgconfbr-2019-fts/
SlideShare (PDF):
https://www.slideshare.net/matheus_de_oliveira/postgresql-full-text-search-com-fuzzy
|
Basicamente você consegue fazer buscas textuais usando:
pg_trgm
para indexação!tsvector
com a função to_tsvector
SELECT to_tsvector('O rato roeu a roupa do rei de roma');
to_tsvector
-------------------------------------------
'rat':2 'rei':7 'roeu':3 'rom':9 'roup':5
(1 row)
SELECT to_tsvector('english', 'The quick brown fox jumped over the lazy dog.');
to_tsvector
-------------------------------------------------------
'brown':3 'dog':9 'fox':4 'jump':5 'lazi':8 'quick':2
(1 row)
to_tsquery
SELECT to_tsquery('rato & roma');
to_tsquery
---------------
'rat' & 'rom'
(1 row)
plainto_tsquery
SELECT plainto_tsquery('rato roma');
plainto_tsquery
-----------------
'rat' & 'rom'
(1 row)
phraseto_tsquery
:
SELECT phraseto_tsquery('rei de roma');
phraseto_tsquery
------------------
'rei' <2> 'rom'
(1 row)
websearch_to_tsquery
:
SELECT websearch_to_tsquery('roupa "rei de roma"');
websearch_to_tsquery
--------------------------
'roup' & 'rei' <2> 'rom'
(1 row)
tsvetor
casa com um tsquery
SELECT
to_tsvector('O rato roeu a roupa do rei de Roma')
@@ phraseto_tsquery('rei de roma');
?column?
----------
t
(1 row)
Existem basicamente 4 tipos de configurações:
Templates disponíveis:
$ SHAREDIR="$( pg_config --sharedir )"
$ TSEARCH_DATA="${SHAREDIR}/tsearch_data/"
$ sudo cp $TSEARCH_DATA/synonym_sample.syn $TSEARCH_DATA/my_synonym.syn
$ cat $TSEARCH_DATA/my_synonym.syn
CREATE TEXT SEARCH CONFIGURATION my_portuguese(COPY = portuguese);
\dF+ my_portuguese
CREATE TEXT SEARCH DICTIONARY my_synonym(
TEMPLATE = synonym,
SYNONYMS = my_synonym -- nome do arquivo com extensão .syn
);
ALTER TEXT SEARCH CONFIGURATION my_portuguese
ALTER MAPPING FOR asciiword, asciihword, hword, hword_part, word
WITH my_synonym, portuguese_stem;
SELECT count(*) FROM products
WHERE to_tsvector('portuguese', name) @@
to_tsquery('portuguese', 'energetico');
count
-------
0
(1 row)
CREATE EXTENSION unaccent;
ALTER TEXT SEARCH CONFIGURATION my_portuguese
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, my_synonym, portuguese_stem;
SELECT count(*) FROM products
WHERE to_tsvector('my_portuguese', name) @@
to_tsquery('my_portuguese', 'energetico');
count
-------
3
(1 row)
sudo cp $TSEARCH_DATA/thesaurus_sample.ths \
$TSEARCH_DATA/my_thesaurus.ths
CREATE TEXT SEARCH DICTIONARY my_thesaurus(
TEMPLATE = thesaurus,
DictFile = my_thesaurus, -- nome do arquivo com extensão .ths
Dictionary = pg_catalog.portuguese_stem -- subdicionário
);
ALTER TEXT SEARCH CONFIGURATION my_portuguese
ALTER MAPPING FOR asciiword, asciihword, hword, hword_part, word
WITH my_thesaurus, portuguese_stem;
ALTER TEXT SEARCH CONFIGURATION my_portuguese
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, my_thesaurus, portuguese_stem;
São basicamente os dicionários que já usamos por padrão, e são os responsáveis por stemming.
SELECT count(*) FROM products
WHERE to_tsvector(name) @@ plainto_tsquery('ceveja');
count
-------
0
(1 row)
CREATE TEXT SEARCH CONFIGURATION my_simple (COPY = simple);
ALTER TEXT SEARCH CONFIGURATION my_simple
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, simple;
CREATE MATERIALIZED VIEW IF NOT EXISTS fts_stats_products AS
SELECT
q.query, st.*
FROM
ts_stat(
$$SELECT to_tsvector('my_simple', name) FROM products$$
) AS st,
nullif(to_tsquery('my_portuguese',
quote_literal(st.word)), '') AS q(query)
WHERE q.query IS NOT NULL
UNION ALL
SELECT * FROM
(VALUES(to_tsquery('my_portuguese', 'Budweiser'), 'bud', -1, -1))
AS syn(query, word, ndoc, nentry)
;
CREATE UNIQUE INDEX IF NOT EXISTS fts_stats_products_word_u_idx
ON fts_stats_products(word);
CREATE INDEX IF NOT EXISTS fts_stats_products_word_idx
ON fts_stats_products USING gist(word gist_trgm_ops);
CREATE OR REPLACE FUNCTION levenshtein_distance(str1 text, str2 text)
RETURNS float8
LANGUAGE SQL AS $$
SELECT
1-levenshtein(str1, str2)/
(1e-20 + length(str1) + length(str2)
- levenshtein(str1, str2))::float4
;
$$;
CREATE OR REPLACE FUNCTION my_plainto_tsquery(source text) RETURNS tsquery IMMUTABLE LANGUAGE SQL AS $$ SELECT string_agg( '(' || concat_ws(' | ', queries.query_source::text, queries.queries_or) || ')', ' & ' )::tsquery AS query FROM unnest(to_tsvector('my_simple', source)) AS usr
CROSS JOIN LATERAL( SELECT to_tsquery('my_portuguese', quote_literal(usr.lexeme)) AS query_source, string_agg(matches.query::text, ' | ') AS queries_or FROM ( SELECT s.* FROM fts_stats_products s WHERE s.word % usr.lexeme AND levenshtein_distance(s.word, usr.lexeme) > 0.8 ORDER BY s.word <-> usr.lexeme LIMIT 50 ) AS matches ) AS queries ; $$;
SELECT count(*) FROM products
WHERE to_tsvector(name) @@ my_plainto_tsquery('ceveja');
count
-------
3
(1 row)
Matheus de Oliveira
matioli.matheus@gmail.com
irc.freenode.net - /join #postgresql,#postgresql-br
: @MatheusOl
Twitter: @matioli_matheus
LinkedIn: br.linkedin.com/in/matheusdeoliveira/
SlideShare: slideshare.net/matheus_de_oliveira