- elasticsearch
- 大而全
- 支持分布式
- MeiliSearch
- 原生支持中文(jieba)
- 不支持分布式
- 基于 AI 的搜索
此处使用 Meilisearch
使用
python
import json
from pathlib import Path
import meilisearch
from meilisearch.errors import MeilisearchApiError
try:
client = meilisearch.Client(
"http://localhost:7700", "master"
)
except MeilisearchApiError as e:
print(e)
raise
index = client.index("movies")
# https://www.meilisearch.com/docs/reference/api/settings
index.update_filterable_attributes(["title", "genres"]) # 允许过滤的属性
index.update_sortable_attributes(["title"]) # 允许排序的属性
index.update_stop_words(["of", "the"]) # 停用词(过滤掉)
index.update_pagination_settings({"maxTotalHits": 500}) # 限制搜索结果的数量
# 中文同义词 https://github.com/jaaack-wang/Chinese-Synonyms
synon = Path(__file__).parent / "synonyms.json"
with synon.open() as f:
res = json.load(f)
index.update_synonyms(res) # 同义词
index.update_ranking_rules(
[
"words", # 根据匹配数量
"typo", # 拼写错误的在后
"proximity", # 查询词之间的距离
"attribute",
"sort", # 查询时指定排序
"exactness", # 相似度排序
]
)
# 查看设置
# index.get_stop_words()
settings = index.get_settings()
print(settings)
documents = [
{"id": 1, "title": "Carol", "genres": ["Romance", "Drama"]},
{"id": 2, "title": "Wonder Woman", "genres": ["Action", "Adventure"]},
{"id": 3, "title": "Life of Pi", "genres": ["Adventure", "Drama"]},
{
"id": 4,
"title": "Mad Max: Fury Road",
"genres": ["Adventure", "Science Fiction"],
},
{"id": 5, "title": "Moana", "genres": ["Fantasy", "Action"]},
{"id": 6, "title": "Philadelphia", "genres": ["Drama"]},
]
# 添加数据
task = index.add_documents(documents)
while (status := client.get_task(task.task_uid).status) != "succeeded":
import time
print(status)
print("waiting for indexation")
time.sleep(1)
print("indexation done")
# 查询
# https://www.meilisearch.com/docs/reference/api/search#search-parameters
res = index.search(
"caorl",
{
"limit": 2, # 限制返回的结果数量
"offset": 0, # 跳过前面的结果
"attributesToRetrieve": ["title"], # 仅返回指定的属性
"hitsPerPage": 2, # 限制每页数量
"page": 1, # 页码从开始
"attributesToHighlight": ["title"], # 高亮指定的属性
"attributesToCrop": ["title"], # 截断指定的属性
"cropLength": 10, # 截断长度
"filter": "genres = Action", # 过滤,必须是可过滤的属性
"sort": ["title:desc"], # 排序,必须是可排序的属性
},
) # 具有容错能力 carol
print(res)
# 删除
index.delete_document(1) # 删除指定的文档
index.delete_documents([1, 2, 3]) # 删除指定的文档
index.delete_documents({filter: "genres=action OR genres=adventure"}) # 按条件删除
index.delete_all_documents() # 删除所有
# WARNING 删除 index
index.delete()
# 更新文档
index.update_documents(documents) # 更新
index.add_documents(documents) # 当id存在时更新,不存在时添加
查询结果
python
{
"hits": [
{
"id": 1,
"title": "Carol",
"genre": ["Romance", "Drama"]
}
],
"offset": 0,
"limit": 20,
"processingTimeMs": 1,
"query": "caorl"
}