singletongue committed (verified)
Commit c5df237 · 1 Parent(s): 647335c

Use ja-v0.2 model, ignore categories of some patterns

Files changed (1):
  1. app.py +23 -1
app.py CHANGED

@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 
 import gradio as gr
@@ -7,7 +8,18 @@ from transformers import AutoModelForPreTraining, AutoTokenizer
 
 
 repo_id = "studio-ousia/luxe"
-revision = "ja-v0.1"
+revision = "ja-v0.2"
+
+ignore_category_patterns = [
+    r"\d+年",
+    r"楽曲 [ぁ-ん]",
+    r"漫画作品 [ぁ-ん]",
+    r"アニメ作品 [ぁ-ん]",
+    r"アニメ作品 [ぁ-ん]",
+    r"の一覧",
+    r"各国の",
+    r"各年の",
+]
 
 model = AutoModelForPreTraining.from_pretrained(repo_id, revision=revision, trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(repo_id, revision=revision, trust_remote_code=True)
@@ -24,6 +36,11 @@ id2category_entity = {
     for entity, entity_id in tokenizer.entity_vocab.items()
     if entity_id >= num_normal_entities
 }
+ignore_category_entity_ids = [
+    entity_id - num_normal_entities
+    for entity, entity_id in tokenizer.entity_vocab.items()
+    if entity_id >= num_normal_entities and any(re.search(pattern, entity) for pattern in ignore_category_patterns)
+]
 
 entity_embeddings = model.luke.entity_embeddings.entity_embeddings.weight
 normal_entity_embeddings = entity_embeddings[:num_normal_entities]
@@ -94,6 +111,8 @@ def get_topk_entities_from_texts(
         tokenized_examples = tokenizer(text, entity_spans=noun_spans, return_tensors="pt")
         model_outputs = model(**tokenized_examples)
 
+        model_outputs.topic_category_logits[:, ignore_category_entity_ids] = float("-inf")
+
         _, topk_normal_entity_ids = model_outputs.topic_entity_logits[0].topk(k)
         topk_normal_entities.append([id2normal_entity[id_] for id_ in topk_normal_entity_ids.tolist()])
 
@@ -120,6 +139,9 @@ def get_similar_entities(query_entity: str, k: int = 10) -> list[str]:
     else:
         query_entity_id -= num_normal_entities
         topk_entity_scores = category_entity_embeddings[query_entity_id] @ category_entity_embeddings.T
+
+        topk_entity_scores[ignore_category_entity_ids] = float("-inf")
+
        topk_entity_ids = topk_entity_scores.topk(k + 1).indices[1:]
         topk_entities = [id2category_entity[entity_id] for entity_id in topk_entity_ids.tolist()]
 
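
The change hinges on two small mechanisms: collecting ignore_category_entity_ids by regex-matching entity names in the vocabulary, and masking those positions with -inf so topk can never return them. Below is a minimal, self-contained sketch of the same mechanism; the toy vocabulary, patterns, and scores are invented for illustration and are not the model's actual data.

import re

import torch

# Hypothetical stand-ins for the app's tokenizer.entity_vocab and
# ignore_category_patterns (illustration only).
entity_vocab = {"東京都": 0, "夏目漱石": 1, "1998年": 2, "楽曲 あ": 3, "の一覧": 4}
ignore_patterns = [r"\d+年", r"楽曲 [ぁ-ん]", r"の一覧"]

# Same shape as the commit's ignore_category_entity_ids comprehension:
# keep the ID of every entity whose name matches any ignore pattern.
ignore_ids = [
    entity_id
    for entity, entity_id in entity_vocab.items()
    if any(re.search(pattern, entity) for pattern in ignore_patterns)
]
assert ignore_ids == [2, 3, 4]

# Toy scores over the five entities; in the app these are model logits
# or embedding dot products.
scores = torch.tensor([0.1, 0.5, 2.0, 1.5, 0.9])

# Masking with -inf removes the ignored entities from any top-k result:
# every finite score ranks above -inf, so topk skips the masked positions.
scores[ignore_ids] = float("-inf")
assert scores.topk(2).indices.tolist() == [1, 0]

Because every finite score ranks above float("-inf"), the masked categories drop out of both the top-k category predictions and the nearest-neighbor category search without perturbing any other score.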