cnmoro committed on
Commit
272be10
·
verified ·
1 Parent(s): deb9717

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +125 -0
README.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - pt
6
+ base_model:
7
+ - cnmoro/tangled-llama-33m-32k-instruct-v0.1-fix
8
+ pipeline_tag: text-classification
9
+ ---
10
+
11
+ ```python
12
+ from tokenizers import Tokenizer
13
+ import onnxruntime as ort
14
+ import numpy as np
15
+
16
# Module-level singletons: both objects are built once when the snippet is
# executed and are then read as globals inside rerank().
# Both paths are relative to the current working directory.
reranker_tokenizer = Tokenizer.from_file('./tokenizer.json')
reranker_session = ort.InferenceSession('./model.onnx')
18
+
19
def rerank(question, passages, normalize_scores=True, max_length=512):
    """Score each passage against ``question`` and return them best-first.

    Every passage is rendered as ``"Query: {question}\\nSentence: {passage}"``,
    tokenized with the module-level ``reranker_tokenizer``, truncated to
    ``max_length`` tokens, right-padded with zeros to the longest sequence in
    the batch, and fed to the module-level ``reranker_session``.

    Args:
        question: The query text inserted into every template.
        passages: Iterable of candidate passages. It is materialised into a
            list, so one-shot iterators are accepted.
        normalize_scores: When true, divide every score by the sum of all
            scores (skipped if that sum is not positive) so they add up to 1.
        max_length: Maximum number of token ids kept per template. ``None``
            disables truncation (plain slice semantics).

    Returns:
        A list of ``(passage, score)`` tuples sorted by descending score.
        Passages with equal scores keep their input order. An empty input
        yields an empty list.
    """
    # Materialise once: the passages are iterated twice (templates + pairing).
    passages = list(passages)
    if not passages:
        # Nothing to score; also avoids max() on an empty sequence below.
        return []

    templates = [f"Query: {question}\nSentence: {passage}" for passage in passages]
    encodings = reranker_tokenizer.encode_batch(templates)

    # Truncate every sequence, then pad to the longest one in this batch.
    token_rows = [encoding.ids[:max_length] for encoding in encodings]
    width = max(len(row) for row in token_rows)

    # Zero-initialised buffers: padded positions keep id 0 and mask 0.
    input_ids = np.zeros((len(token_rows), width), dtype=np.int64)
    attention_mask = np.zeros((len(token_rows), width), dtype=np.int64)
    for row_index, row in enumerate(token_rows):
        input_ids[row_index, :len(row)] = row
        attention_mask[row_index, :len(row)] = 1

    outputs = reranker_session.run(
        None,
        {"input_ids": input_ids, "attention_mask": attention_mask},
    )
    # NOTE(review): assumes the first output is a 2-D (batch, classes) logits
    # array - confirm against the exported model.
    logits = outputs[0]

    # Softmax over the class axis. Subtracting the row maximum leaves the
    # result mathematically unchanged but keeps exp() from overflowing.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(shifted)
    probabilities = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    predicted = np.argmax(probabilities, axis=1).tolist()
    confidence = np.max(probabilities, axis=1).tolist()

    # When class 0 wins, the score is the complement of its probability;
    # otherwise it is the winning class probability.
    # NOTE(review): presumably class 0 means "not relevant" - confirm against
    # the model's label mapping.
    scored = [
        (passage, 1 - conf if label == 0 else conf)
        for passage, label, conf in zip(passages, predicted, confidence)
    ]

    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(scored, key=lambda item: item[1], reverse=True)

    if normalize_scores:
        total = sum(score for _, score in ranked)
        if total > 0:
            ranked = [(passage, score / total) for passage, score in ranked]

    return ranked
80
+
81
# Example 1 (Portuguese): rank six passages about the Pantanal.
question = "O que é o Pantanal?"
passages = [
    "É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.",
    "Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.",
    "O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.",
    "O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.",
    "É um local com importância histórica e cultural para as populações locais.",
    "O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.",
]
ranked_results = rerank(
    question,
    passages,
    normalize_scores=True,
)
# Bare expression: shows the value in a notebook/REPL, no effect in a script.
ranked_results
# Output recorded in the README for this call:
# [('O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.',
#   0.7105862286443647),
#  ('O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.',
#   0.22660008031497725),
#  ('O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.',
#   0.043374300040060654),
#  ('É um local com importância histórica e cultural para as populações locais.',
#   0.0070428120274147726),
#  ('É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.',
#   0.006359544027065005),
#  ('Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.',
#   0.006037034946117598)]
104
+
105
# Example 2 (English): rank five physics passages for a speed-of-light query.
question = "What is the speed of light?"
passages = [
    "Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
    "The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.",
    "The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.",
    "The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.",
    "Light can be described as both a wave and a particle, a concept known as wave-particle duality.",
]
ranked_results = rerank(
    question,
    passages,
    normalize_scores=True,
)
# Bare expression: shows the value in a notebook/REPL, no effect in a script.
ranked_results
# Output recorded in the README for this call:
# [('The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.',
#   0.5686758878772575),
#  ('The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.',
#   0.14584055128478327),
#  ('The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.',
#   0.13790743024424898),
#  ("Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
#   0.08071345159269593),
#  ('Light can be described as both a wave and a particle, a concept known as wave-particle duality.',
#   0.06686267900101434)]
125
+ ```