singletongue committed on
Commit
17276eb
·
verified ·
1 Parent(s): aaaa32a

Fix a bug where input text is not unicode-normalized

Browse files
Files changed (1) hide show
  1. app.py +7 -2
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import re
 
2
  from pathlib import Path
3
 
4
  import gradio as gr
@@ -72,13 +73,17 @@ class MecabTokenizer:
72
  mecab_tokenizer = MecabTokenizer()
73
 
74
 
 
 
 
 
75
  def get_texts_from_file(file_path):
76
  texts = []
77
  with open(file_path) as f:
78
  for line in f:
79
  line = line.strip()
80
  if line:
81
- texts.append(line)
82
 
83
  return texts
84
 
@@ -214,7 +219,7 @@ with gr.Blocks() as demo:
214
  similar_entities = gr.State([])
215
 
216
  text_input = gr.Textbox(label="Input Text")
217
- text_input.change(fn=lambda text: [text], inputs=text_input, outputs=texts)
218
  texts_file = gr.File(label="Input Texts")
219
  texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
220
  topk_input = gr.Number(5, label="Top K", interactive=True)
 
1
  import re
2
+ import unicodedata
3
  from pathlib import Path
4
 
5
  import gradio as gr
 
73
  mecab_tokenizer = MecabTokenizer()
74
 
75
 
76
+ def normalize_text(text: str) -> str:
77
+ return unicodedata.normalize("NFKC", text)
78
+
79
+
80
  def get_texts_from_file(file_path):
81
  texts = []
82
  with open(file_path) as f:
83
  for line in f:
84
  line = line.strip()
85
  if line:
86
+ texts.append(normalize_text(line))
87
 
88
  return texts
89
 
 
219
  similar_entities = gr.State([])
220
 
221
  text_input = gr.Textbox(label="Input Text")
222
+ text_input.change(fn=lambda text: [normalize_text(text)], inputs=text_input, outputs=texts)
223
  texts_file = gr.File(label="Input Texts")
224
  texts_file.change(fn=get_texts_from_file, inputs=texts_file, outputs=texts)
225
  topk_input = gr.Number(5, label="Top K", interactive=True)