DocUA committed on
Commit
9c5a6d0
·
0 Parent(s):

Initial commit without sensitive data

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SDC Multi Classifier
3
+ emoji: 🦀
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.13.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Dict, List
6
+
7
+ from openai import OpenAI
8
+ from dotenv import load_dotenv
9
+
10
# Read variables from a local .env file into the process environment.
load_dotenv()

# 1) The OpenAI key is taken from the OPENAI_API_KEY environment variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared OpenAI client used by every embedding request in this module.
client = OpenAI(api_key=OPENAI_API_KEY)
18
+
19
+
20
+ ##############################################################################
21
+ # 1. Вихідні дані: JSON із "хінтами"
22
+ ##############################################################################
23
# Maps each class label to the hint phrases that describe it.
classes_json = {
    "Pain": [
        "ache",
        "aches",
        "hurts",
        "pain",
        "painful",
        "sore",
        # ...
    ],
    "Chest pain": [
        "aches in my chest",
        "chest pain",
        "chest hurts",
        "sternum pain",
    ],
    "Physical Activity": [
        "exercise",
        "walking",
        "running",
        "biking",
    ],
    "Office visit": [
        "appointment scheduled",
        "annual checkup",
        "office visit",
    ],
    # ...
}

##############################################################################
# 2. Global variables (simplified)
##############################################################################
# All three start as None and are filled in by load_data() and
# build_class_signatures().
df = None
embeddings = None
class_signatures = None
46
+
47
+ ##############################################################################
48
+ # 3. Функція для завантаження даних
49
+ ##############################################################################
50
def load_data(csv_path: str = "messages.csv", emb_path: str = "embeddings.npy"):
    """Load messages and their embeddings into the module-level state.

    Reads the CSV at ``csv_path`` and the NumPy array at ``emb_path``, resets
    the ``Target`` column to ``"Unlabeled"``, standardizes the embeddings
    column-wise (zero mean, unit standard deviation along axis 0), and stores
    the results in the globals ``df`` and ``embeddings``.

    Args:
        csv_path: Path of the CSV file with the messages.
        emb_path: Path of the ``.npy`` file with one embedding per CSV row.

    Raises:
        ValueError: If the CSV and the embedding array have different lengths.
    """
    global df, embeddings
    df_local = pd.read_csv(csv_path)
    emb_local = np.load(emb_path)

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if len(df_local) != len(emb_local):
        raise ValueError("CSV і embeddings різної довжини!")

    df_local["Target"] = "Unlabeled"

    # Standardize the embeddings. A column with zero spread (constant value,
    # or a single-row input) would divide by zero and fill the matrix with
    # NaN/inf, so such columns are divided by 1 and end up as zeros.
    mean = emb_local.mean(axis=0)
    std = emb_local.std(axis=0)
    std = np.where(std == 0, 1, std)
    emb_local = (emb_local - mean) / std

    df = df_local
    embeddings = emb_local
63
+
64
+ ##############################################################################
65
+ # 4. Виклик OpenAI для отримання одного embedding
66
+ ##############################################################################
67
def get_openai_embedding(text: str, model_name: str = "text-embedding-3-small") -> list:
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The text to embed.
        model_name: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    first_item = client.embeddings.create(model=model_name, input=text).data[0]
    return first_item.embedding
73
+
74
+ ##############################################################################
75
+ # 5. Отримати embeddings для списку фраз (хінтів) і усереднити
76
+ ##############################################################################
77
def embed_hints(hint_list: List[str], model_name: str) -> np.ndarray:
    """Embed every hint phrase and stack the results into one float32 array.

    Args:
        hint_list: Hint phrases to embed, one API request per phrase.
        model_name: Embedding model name forwarded to get_openai_embedding.

    Returns:
        A float32 array with one row per hint, in the order given.
    """
    vectors = [get_openai_embedding(phrase, model_name=model_name) for phrase in hint_list]
    return np.array(vectors, dtype=np.float32)
83
+
84
+ ##############################################################################
85
+ # 6. Будуємо signatures для кожного класу
86
+ ##############################################################################
87
def build_class_signatures(model_name: str):
    """Build one signature vector per class and store them globally.

    For every class in ``classes_json`` that has at least one hint, the hints
    are embedded and averaged along axis 0. Classes with an empty hint list
    are skipped. The resulting dict replaces the global ``class_signatures``.

    Args:
        model_name: Embedding model name forwarded to embed_hints.

    Returns:
        A status message for the UI.
    """
    global class_signatures
    class_signatures = {
        label: embed_hints(phrases, model_name=model_name).mean(axis=0)
        for label, phrases in classes_json.items()
        if phrases
    }
    return "Signatures побудовано!"
97
+
98
+ ##############################################################################
99
+ # 7. Функція класифікації одного рядка (dot product)
100
+ ##############################################################################
101
def predict_class(text_embedding: np.ndarray, signatures: Dict[str, np.ndarray]) -> str:
    """Pick the class whose signature has the largest dot product with the text.

    Args:
        text_embedding: Embedding vector of the message to classify.
        signatures: Mapping of class label to signature vector.

    Returns:
        The label with the highest score. On ties the first label in
        iteration order wins; ``"Unknown"`` is returned when no score
        exceeds negative infinity (for example, an empty mapping).
    """
    winner, top = "Unknown", float("-inf")
    for label in signatures:
        similarity = np.dot(text_embedding, signatures[label])
        # Only a strictly larger score replaces the current winner.
        if not similarity > top:
            continue
        winner, top = label, similarity
    return winner
110
+
111
+ ##############################################################################
112
+ # 8. Класифікація відфільтрованих рядків
113
+ ##############################################################################
114
def classify_rows(filter_substring: str):
    """Classify the loaded messages, optionally only those containing a substring.

    Each selected row gets its ``Target`` cell in the global ``df`` overwritten
    with the label returned by predict_class for the row's embedding.

    Args:
        filter_substring: Case-insensitive literal text a message must contain
            to be classified. An empty value selects every row.

    Returns:
        A status string when signatures or data are missing; otherwise a
        DataFrame with the ``Message`` and ``Target`` columns of the selected
        rows, re-indexed from zero.
    """
    if class_signatures is None:
        return "Спочатку збудуйте signatures!"

    if df is None or embeddings is None:
        return "Дані не завантажені! Спочатку викличте load_data."

    if filter_substring:
        # regex=False: the filter is user-typed plain text, so characters such
        # as "(" or "." must match literally instead of being parsed as a
        # regular expression (which can raise re.error or over-match).
        mask = df["Message"].str.contains(
            filter_substring, case=False, na=False, regex=False
        )
        filtered_idx = df[mask].index
    else:
        filtered_idx = df.index

    for i in filtered_idx:
        # Index labels are used directly as positions in the embedding matrix.
        emb_vec = embeddings[i]
        pred = predict_class(emb_vec, class_signatures)
        df.at[i, "Target"] = pred

    result_df = df.loc[filtered_idx, ["Message", "Target"]].copy()
    return result_df.reset_index(drop=True)
135
+
136
+ ##############################################################################
137
+ # 9. Збереження CSV
138
+ ##############################################################################
139
def save_data():
    """Write the labeled messages to ``messages_with_labels.csv``.

    Returns:
        A status message: a "no data" notice when nothing has been loaded,
        otherwise a confirmation that the file was written.
    """
    if df is not None:
        df.to_csv("messages_with_labels.csv", index=False)
        return "Файл 'messages_with_labels.csv' збережено!"
    return "Дані відсутні!"
145
+
146
+ ##############################################################################
147
+ # 10. Gradio UI
148
+ ##############################################################################
149
def ui_load_data(csv_path, emb_path):
    """UI callback: load the data files and report how many rows were read."""
    load_data(csv_path, emb_path)
    summary = f"Data loaded from {csv_path} and {emb_path}. Rows: {len(df)}"
    return summary
152
+
153
def ui_build_signatures(model_name):
    """UI callback: build class signatures with the chosen model and return the status."""
    return build_class_signatures(model_name)
156
+
157
def ui_classify_data(filter_substring):
    """UI callback: classify rows and return the table for the Dataframe output.

    classify_rows returns a plain string when signatures or data are missing.
    This callback feeds a ``gr.Dataframe`` output, which cannot show such a
    message as a table, so the message is raised as ``gr.Error`` and appears
    as an error notification instead.
    NOTE(review): Gradio documents ``str`` values for Dataframe as CSV file
    paths — confirm for the pinned sdk_version.

    Args:
        filter_substring: Optional text filter forwarded to classify_rows.

    Returns:
        The DataFrame produced by classify_rows.

    Raises:
        gr.Error: When classify_rows reports a problem as a string.
    """
    result = classify_rows(filter_substring)
    if isinstance(result, str):
        raise gr.Error(result)
    return result
162
+
163
def ui_save_data():
    """UI callback: save the labeled data and return the status message."""
    status = save_data()
    return status
165
+
166
def main():
    """Assemble the Gradio interface for the classifier and launch it.

    The page has four steps — load data, build class signatures, classify,
    save — each wired to the matching ``ui_*`` callback.
    """
    import gradio as gr

    # The title is set on the same Blocks instance that holds the components.
    # Creating a second gr.Blocks only to set the title would launch an empty
    # page instead of this interface.
    with gr.Blocks(title="SDC Multi Classifier") as demo:
        gr.Markdown("# SDC Classifier з Gradio")
        gr.Markdown("## 1) Завантаження даних")

        # NOTE(review): the row is assumed to hold the two path fields and the
        # load button — confirm against the intended layout.
        with gr.Row():
            csv_input = gr.Textbox(value="messages.csv", label="CSV-файл")
            emb_input = gr.Textbox(value="embeddings.npy", label="Numpy Embeddings")
            load_btn = gr.Button("Load data")

        load_output = gr.Label(label="Loading result")

        load_btn.click(fn=ui_load_data, inputs=[csv_input, emb_input], outputs=load_output)

        gr.Markdown("## 2) Побудова Class Signatures")
        model_choice = gr.Dropdown(
            choices=["text-embedding-3-large", "text-embedding-3-small"],
            value="text-embedding-3-small",
            label="OpenAI model",
        )
        build_btn = gr.Button("Build signatures")
        build_out = gr.Label(label="Signatures")

        build_btn.click(fn=ui_build_signatures, inputs=[model_choice], outputs=build_out)

        gr.Markdown("## 3) Класифікація")
        filter_in = gr.Textbox(label="Filter substring (optional)")
        classify_btn = gr.Button("Classify")
        classify_out = gr.Dataframe(label="Result (Message / Target)")

        classify_btn.click(fn=ui_classify_data, inputs=[filter_in], outputs=[classify_out])

        gr.Markdown("## 4) Зберегти CSV")
        save_btn = gr.Button("Save labeled data")
        save_out = gr.Label()

        save_btn.click(fn=ui_save_data, inputs=[], outputs=save_out)

        gr.Markdown("""
        ### Опис:
        1. Натисніть 'Load data', щоб завантажити ваші дані (CSV + embeddings).
        2. Укажіть OpenAI API модель, натисніть 'Build signatures'.
        3. Вкажіть фільтр (необов'язково), натисніть 'Classify'.
           Отримаєте таблицю з полем Target.
        4. 'Save labeled data' збереже 'messages_with_labels.csv'.
        """)

    demo.launch()


if __name__ == "__main__":
    main()
create_embeddings.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ from openai import OpenAI
6
+ from dotenv import load_dotenv
7
+
8
# Read variables from a local .env file into the process environment.
load_dotenv()

# 1) The OpenAI key is taken from the OPENAI_API_KEY environment variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)

# 2) File names and model
CSV_FILE = "messages_with_labels.csv"  # CSV with a "Message" column
OUTPUT_EMB_FILE = "embeddings.npy"
MODEL_NAME = "text-embedding-3-small"  # or another embedding model

# 3) Read the CSV; missing messages become empty strings.
# NOTE(review): empty strings are sent to the API as-is — confirm the
# embeddings endpoint accepts empty input, or filter such rows first.
df = pd.read_csv(CSV_FILE)
texts = df["Message"].fillna("").tolist()

# 4) One OpenAI request per row; keep the first embedding of each response.
embeddings_list = [
    client.embeddings.create(input=text, model=MODEL_NAME).data[0].embedding
    for text in texts
]

# 5) Convert the list to a float32 array and save it.
embedding_matrix = np.array(embeddings_list, dtype=np.float32)
np.save(OUTPUT_EMB_FILE, embedding_matrix)

print(f"Embeddings saved to {OUTPUT_EMB_FILE} with shape {embedding_matrix.shape}")
embeddings.npy ADDED
Binary file (73.9 kB). View file
 
messages.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Message,Target
2
+ "I have a strong ache in my left arm",Pain
3
+ "My chest hurts sometimes, especially when I breathe deeply",Chest pain
4
+ "Just finished running 3 miles",Physical Activity
5
+ "I scheduled an appointment next week for my annual checkup",Office visit
6
+ "Feel a bit sore in my legs after walking",Pain
7
+ "Went biking for 10 miles this morning",Physical Activity
8
+ "Annual checkup with my doctor is planned",Office visit
9
+ "There's a sternum pain in the center of my chest",Chest pain
10
+ "I'm going to exercise daily",Physical Activity
11
+ "My back is painful when I wake up",Pain
12
+ "I have no health issues right now",Unknown
13
+ "I'm here to schedule an office visit for next month",Office visit
messages_with_labels.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Message,Target
2
+ I have a strong ache in my left arm,Pain
3
+ "My chest hurts sometimes, especially when I breathe deeply",Chest pain
4
+ Just finished running 3 miles,Physical Activity
5
+ I scheduled an appointment next week for my annual checkup,Office visit
6
+ Feel a bit sore in my legs after walking,Pain
7
+ Went biking for 10 miles this morning,Physical Activity
8
+ Annual checkup with my doctor is planned,Office visit
9
+ There's a sternum pain in the center of my chest,Chest pain
10
+ I'm going to exercise daily,Physical Activity
11
+ My back is painful when I wake up,Chest pain
12
+ I have no health issues right now,Physical Activity
13
+ I'm here to schedule an office visit for next month,Office visit
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ pandas
4
+ numpy
5
+ python-dotenv
test_messages.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def test_messages_with_labels(path_csv="messages_with_labels.csv"):
    """Print a quick summary of a labeled-messages CSV.

    Shows the first five rows and the count of each ``Target`` value. When a
    ``TrueLabel`` column is present, also prints the share of rows where
    ``Target`` equals ``TrueLabel``.

    Args:
        path_csv: Path of the CSV file to inspect.
    """
    labeled = pd.read_csv(path_csv)

    # First five rows
    print("Перші 5 рядків з messages_with_labels.csv:")
    print(labeled.head())

    # Number of rows per Target label
    print("\nРозподіл за мітками (Target):")
    print(labeled["Target"].value_counts())

    # Accuracy can only be computed when a ground-truth column is available.
    if "TrueLabel" not in labeled.columns:
        print("\nКолонка 'TrueLabel' відсутня — не можемо автоматично оцінити точність.")
        return

    accuracy = (labeled["Target"] == labeled["TrueLabel"]).mean()
    print(f"\nAccuracy (Target vs TrueLabel): {accuracy:.2%}")


# Run the summary when executed as a script.
if __name__ == "__main__":
    test_messages_with_labels()