# Source: shigureui — commit eff5a71 ("doc"); page-header residue converted to a comment.
import json
import os

from jieba.analyse import ChineseAnalyzer
from whoosh.fields import TEXT, SchemaClass, ID
from whoosh.index import create_in
analyzer = ChineseAnalyzer()
class ArticleSchema(SchemaClass):
index = ID(stored=True)
原文 = TEXT(stored=True, analyzer=analyzer)
注释 = TEXT(stored=True, analyzer=analyzer)
批判 = TEXT(stored=True, analyzer=analyzer)
章节 = TEXT(stored=True, analyzer=analyzer)
schema = ArticleSchema()
ix = create_in("indexdir", schema, indexname="article_index")
writer = ix.writer()
with open("反孔.json", encoding="utf-8") as json_file:
raw_jsons = json.load(json_file)
for vhjx_item in raw_jsons:
for jvvi_item in vhjx_item[1:]:
print(jvvi_item["index"])
writer.add_document(
index=jvvi_item["index"],
原文=jvvi_item["原文"],
注释=jvvi_item["注释"] if "注释" in jvvi_item else "",
批判=jvvi_item["批判"] if "批判" in jvvi_item else "",
章节=vhjx_item[0],
)
writer.commit()
# init
import gradio as gr
from whoosh.qparser import QueryParser
from whoosh.index import open_dir
import re
from whoosh.query import Term
ix = open_dir("indexdir", indexname="article_index")
searcher = ix.searcher()
def search(query_info):
query = QueryParser("原文", ix.schema).parse(query_info)
results = searcher.search(query)
map_hit = []
for hit in results:
批判文本 = hit.get("批判", "")
matches = re.findall(r"\d+[\·\.]\d+", 批判文本)
map_hit.append(dict(hit))
map_hit[-1]["extra"] = []
for index_ref in matches:
index_ref_normalized = index_ref.replace(".", "·")
term_query = Term("index", index_ref_normalized)
related_results = searcher.search(term_query)
for related_hit in related_results:
map_hit[-1]["extra"].append(dict(related_hit))
return map_hit
def lunyu_search(query):
"""
Search for relevant critical commentary entries based on an input query from the Analects.
This function parses the input query, performs a fuzzy search in the indexed original text field,
and extracts related critiques. If any numeric index references (e.g., '3·2') are found in the
commentary, it will further retrieve related entries using these references.
Args:
query (str): The input text (a line from the Analects, possibly fuzzy or partial) to search.
Returns:
List[dict]: A list of result entries. Each entry contains the original hit and a list of related entries
under the key "extra", retrieved via index references mentioned in the commentary.
"""
return search(query_info=query)
demo = gr.Interface(
fn=lunyu_search,
inputs=gr.Textbox(label="输入部分原文句子"),
outputs=gr.JSON(label="查询结果"),
title="论语批判MCP",
description="输入模糊的论语原文,可以查询到对应的批判内容。",
examples=[
["季氏旅于泰山。"],
["子曰:学而时习之,不亦说乎?"],
["有朋自远方来,不亦乐乎?"],
["三人行,必有我师焉。"],
],
)
if __name__ == "__main__":
res = search("季氏旅于泰山。")
print(res)
demo.launch(mcp_server=True)