Spaces:

studio-ousia
/

luxe-demo

Running on CPU Upgrade

App Files Community

singletongue committed on Feb 25

Commit

3f97903

verified ·

1 Parent(s): c1e0e01

Set maximum values for input text length and number of lines in input files

Browse files

Files changed (1) hide show

app.py +34 -18

app.py CHANGED Viewed

@@ -16,6 +16,9 @@ from transformers import AutoModelForPreTraining, AutoTokenizer
 ALIAS_SEP = "|"
 ENTITY_SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[MASK]", "[MASK2]"]
 repo_id = "studio-ousia/luxe"
 revision = "ja-v0.3.1"
@@ -68,10 +71,14 @@ def get_texts_from_file(file_path: str | None):
         try:
             with open(file_path, newline="") as f:
                 reader = csv.DictReader(f, fieldnames=["text"])
-                for row in reader:
                     text = normalize_text(row["text"]).strip()
                     if text != "":
-                        texts.append(text)
         except Exception as e:
             gr.Warning("ファイルを正しく読み込めませんでした。")
             print(e)
@@ -144,7 +151,7 @@ def get_topk_entities_from_texts(
     k: int = 5,
     entity_span_sensitivity: float = 1.0,
     nayose_coef: float = 1.0,
-    entities_are_replaced: bool = False,
 ) -> tuple[list[list[tuple[int, int]]], list[list[str]], list[list[str]], list[list[list[str]]]]:
     model, tokenizer, bm25_tokenizer, bm25_retriever = models
@@ -196,7 +203,7 @@ def get_topk_entities_from_texts(
         if model_outputs.entity_logits is not None:
             span_entity_logits = model_outputs.entity_logits[0, :, :500000]
-            if nayose_coef > 0.0 and not entities_are_replaced:
                 nayose_queries = ["ja:" + text[start:end] for start, end in entity_spans]
                 nayose_query_tokens = bm25_tokenizer.tokenize(nayose_queries)
                 nayose_scores = torch.vstack(
@@ -265,7 +272,11 @@ def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]
         try:
             with open(file_path, newline="") as f:
                 reader = csv.DictReader(f, fieldnames=["entity", "text"])
-                for row in reader:
                     entity = normalize_text(row["entity"]).strip()
                     text = normalize_text(row["text"]).strip()
                     if entity != "" and text != "":
@@ -281,6 +292,7 @@ def get_new_entity_text_pairs_from_file(file_path: str | None) -> list[list[str]
 def replace_entities(
     models,
     new_entity_text_pairs: list[tuple[str, str]],
     new_num_category_entities: int = 0,
     new_entity_counts: list[int] | None = None,
     new_padding_idx: int = 0,
@@ -314,7 +326,7 @@ def replace_entities(
     for entity, text in new_entity_text_pairs:
         entity_id = tokenizer.entity_vocab[entity]
-        tokenized_inputs = tokenizer(text, return_tensors="pt")
         model_outputs = model(**tokenized_inputs)
         entity_embeddings = model.entity_predictions.transform(model_outputs.last_hidden_state[:, 0])
         new_entity_embeddings_dict[entity_id].append(entity_embeddings[0])
@@ -363,7 +375,7 @@ def replace_entities(
     gr.Info("モデルとトークナイザのエンティティの置き換えが完了しました", duration=5)
-    return True
 with gr.Blocks() as demo:
@@ -381,7 +393,7 @@ with gr.Blocks() as demo:
     texts = gr.State([])
-    entities_are_replaced = gr.State(False)
     topk = gr.State(5)
     entity_span_sensitivity = gr.State(1.0)
@@ -400,12 +412,14 @@ with gr.Blocks() as demo:
     gr.Markdown("## 入力テキスト")
     with gr.Tab(label="直接入力"):
-        text_input = gr.Textbox(label="入力テキスト")
     with gr.Tab(label="ファイルアップロード"):
-        texts_file = gr.File(label="入力テキストファイル")
     with gr.Accordion(label="LUXEのエンティティ語彙を置き換える", open=False):
-        new_entity_text_pairs_file = gr.File(label="エンティティと説明文のCSVファイル")
         new_entity_text_pairs_input = gr.Dataframe(
             # value=sample_new_entity_text_pairs,
             headers=["entity", "text"],
@@ -420,7 +434,9 @@ with gr.Blocks() as demo:
         fn=get_new_entity_text_pairs_from_file, inputs=new_entity_text_pairs_file, outputs=new_entity_text_pairs_input
     )
     replace_entity_button.click(
-        fn=replace_entities, inputs=[models, new_entity_text_pairs_input], outputs=entities_are_replaced
     )
     with gr.Accordion(label="ハイパーパラメータ", open=False):
@@ -442,28 +458,28 @@ with gr.Blocks() as demo:
     texts.change(
         fn=get_topk_entities_from_texts,
-        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entities_are_replaced],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     topk.change(
         fn=get_topk_entities_from_texts,
-        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entities_are_replaced],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     entity_span_sensitivity.change(
         fn=get_topk_entities_from_texts,
-        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entities_are_replaced],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     nayose_coef.change(
         fn=get_topk_entities_from_texts,
-        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entities_are_replaced],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
-    entities_are_replaced.change(
         fn=get_topk_entities_from_texts,
-        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entities_are_replaced],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )

 ALIAS_SEP = "|"
 ENTITY_SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[MASK]", "[MASK2]"]
+MAX_TEXT_LENGTH = 800
+MAX_TEXT_FILE_LINES = 100
+MAX_ENTITY_FILE_LINES = 1000
 repo_id = "studio-ousia/luxe"
 revision = "ja-v0.3.1"
         try:
             with open(file_path, newline="") as f:
                 reader = csv.DictReader(f, fieldnames=["text"])
+                for i, row in enumerate(reader):
+                    if i >= MAX_TEXT_FILE_LINES:
+                        gr.Info(f"{MAX_TEXT_FILE_LINES}行目までのデータを読み込みました。")
+                        break
                     text = normalize_text(row["text"]).strip()
                     if text != "":
+                        texts.append(text[:MAX_TEXT_LENGTH])
         except Exception as e:
             gr.Warning("ファイルを正しく読み込めませんでした。")
             print(e)
     k: int = 5,
     entity_span_sensitivity: float = 1.0,
     nayose_coef: float = 1.0,
+    entity_replaced_counts: bool = False,
 ) -> tuple[list[list[tuple[int, int]]], list[list[str]], list[list[str]], list[list[list[str]]]]:
     model, tokenizer, bm25_tokenizer, bm25_retriever = models
         if model_outputs.entity_logits is not None:
             span_entity_logits = model_outputs.entity_logits[0, :, :500000]
+            if nayose_coef > 0.0 and entity_replaced_counts == 0:
                 nayose_queries = ["ja:" + text[start:end] for start, end in entity_spans]
                 nayose_query_tokens = bm25_tokenizer.tokenize(nayose_queries)
                 nayose_scores = torch.vstack(
         try:
             with open(file_path, newline="") as f:
                 reader = csv.DictReader(f, fieldnames=["entity", "text"])
+                for i, row in enumerate(reader):
+                    if i >= MAX_ENTITY_FILE_LINES:
+                        gr.Info(f"{MAX_ENTITY_FILE_LINES}行目までのデータを読み込みました。")
+                        break
                     entity = normalize_text(row["entity"]).strip()
                     text = normalize_text(row["text"]).strip()
                     if entity != "" and text != "":
 def replace_entities(
     models,
     new_entity_text_pairs: list[tuple[str, str]],
+    entity_replaced_counts: int,
     new_num_category_entities: int = 0,
     new_entity_counts: list[int] | None = None,
     new_padding_idx: int = 0,
     for entity, text in new_entity_text_pairs:
         entity_id = tokenizer.entity_vocab[entity]
+        tokenized_inputs = tokenizer(text[:MAX_TEXT_LENGTH], return_tensors="pt")
         model_outputs = model(**tokenized_inputs)
         entity_embeddings = model.entity_predictions.transform(model_outputs.last_hidden_state[:, 0])
         new_entity_embeddings_dict[entity_id].append(entity_embeddings[0])
     gr.Info("モデルとトークナイザのエンティティの置き換えが完了しました", duration=5)
+    return entity_replaced_counts + 1
 with gr.Blocks() as demo:
     texts = gr.State([])
+    entity_replaced_counts = gr.State(0)
     topk = gr.State(5)
     entity_span_sensitivity = gr.State(1.0)
     gr.Markdown("## 入力テキスト")
     with gr.Tab(label="直接入力"):
+        text_input = gr.Textbox(label=f"入力テキスト（最大{MAX_TEXT_LENGTH}文字）", max_length=MAX_TEXT_LENGTH)
     with gr.Tab(label="ファイルアップロード"):
+        texts_file = gr.File(label=f"入力テキストファイル（最大{MAX_TEXT_FILE_LINES}行）")
     with gr.Accordion(label="LUXEのエンティティ語彙を置き換える", open=False):
+        new_entity_text_pairs_file = gr.File(
+            label=f"エンティティと説明文のCSVファイル（最大{MAX_ENTITY_FILE_LINES}行）"
+        )
         new_entity_text_pairs_input = gr.Dataframe(
             # value=sample_new_entity_text_pairs,
             headers=["entity", "text"],
         fn=get_new_entity_text_pairs_from_file, inputs=new_entity_text_pairs_file, outputs=new_entity_text_pairs_input
     )
     replace_entity_button.click(
+        fn=replace_entities,
+        inputs=[models, new_entity_text_pairs_input, entity_replaced_counts],
+        outputs=entity_replaced_counts,
     )
     with gr.Accordion(label="ハイパーパラメータ", open=False):
     texts.change(
         fn=get_topk_entities_from_texts,
+        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     topk.change(
         fn=get_topk_entities_from_texts,
+        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     entity_span_sensitivity.change(
         fn=get_topk_entities_from_texts,
+        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
     nayose_coef.change(
         fn=get_topk_entities_from_texts,
+        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )
+    entity_replaced_counts.change(
         fn=get_topk_entities_from_texts,
+        inputs=[models, texts, topk, entity_span_sensitivity, nayose_coef, entity_replaced_counts],
         outputs=[batch_entity_spans, topk_normal_entities, topk_category_entities, topk_span_entities],
     )