bintangyosua committed
Commit 71f46e1 (verified) · Parent: 9410616

Update app.py

Files changed (1)
  1. app.py +679 -679
app.py CHANGED
@@ -581,8 +581,8 @@
 @app.cell(hide_code=True)
 def __():
-    # loaded_model = tf.keras.models.load_model('models/model_8812.keras')
-    # model_history_loaded = joblib.load('history/history_model_8812.pkl')
+    loaded_model = tf.keras.models.load_model('models/model_8812.keras')
+    model_history_loaded = joblib.load('history/history_model_8812.pkl')

     # loaded_model = clf_model
     # model_history_loaded = model_history
     return

app.py after this commit (full file):
import marimo

__generated_with = "0.9.15"
app = marimo.App(width="full")


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        """
        # Political Ideologies Analysis

        This project provides a detailed analysis of political ideologies using the Hugging Face Political Ideologies dataset. The code leverages various data science libraries and visualization tools to map, analyze, and visualize political ideology text data.

        The analysis is based on a Hugging Face dataset repository; you can find the dataset [here](https://huggingface.co/datasets/JyotiNayak/political_ideologies).
        """
    )
    return


@app.cell(hide_code=True)
def __(form, mo, try_predict):
    text_classified = 'Please write something'
    if form.value:
        text_classified = try_predict(form.value)
    mo.vstack([form, mo.md(f"Your opinion is classified as: **{text_classified}**")])
    return (text_classified,)


@app.cell(hide_code=True)
def __():
    import os

    import marimo as mo
    import pandas as pd
    import numpy as np
    import random

    import matplotlib.pyplot as plt
    import seaborn as sns
    import altair as alt

    from gensim.models import Word2Vec
    from sklearn.manifold import TSNE
    from umap import UMAP

    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

    import re
    import string

    from gensim.models import FastText
    from wordcloud import WordCloud
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    from nltk.stem.porter import PorterStemmer
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, classification_report
    import joblib

    import nltk

    mo.md("""
    ## 1. Import all libraries needed

    The initial cells import the necessary libraries for data handling, visualization, and word embedding.
    """)
    return (
        Bidirectional,
        Dense,
        EarlyStopping,
        Embedding,
        FastText,
        LSTM,
        PorterStemmer,
        ReduceLROnPlateau,
        Sequential,
        TSNE,
        Tokenizer,
        UMAP,
        Word2Vec,
        WordCloud,
        WordNetLemmatizer,
        accuracy_score,
        alt,
        classification_report,
        joblib,
        mo,
        nltk,
        np,
        os,
        pad_sequences,
        pd,
        plt,
        random,
        re,
        sns,
        stopwords,
        string,
        tf,
        train_test_split,
        word_tokenize,
    )


@app.cell(hide_code=True)
def __():
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        """
        Here are the mappings for the label and issue type columns:

        ```yaml
        Label Mapping: {'conservative': 0, 'liberal': 1}
        Issue Type Mapping: {
            'economic': 0, 'environmental': 1,
            'family/gender': 2, 'geo-political and foreign policy': 3,
            'political': 4, 'racial justice and immigration': 5,
            'religious': 6, 'social, health and education': 7
        }
        ```
        """
    )
    return


@app.cell(hide_code=True)
def __(mo, pd):
    df = pd.read_parquet('train.parquet')
    df_val = pd.read_parquet('val.parquet')
    df_test = pd.read_parquet('test.parquet')

    df = df.drop('__index_level_0__', axis=1)

    mo.md("""
    ## 2. Dataset Loading

    The dataset files (`train.parquet`, `val.parquet`, and `test.parquet`) are loaded into separate DataFrames. The training DataFrame (`df`) is cleaned by dropping the leftover `__index_level_0__` index column, and the numeric labels are mapped to readable text for ease of understanding.
    """)
    return df, df_test, df_val


@app.cell(hide_code=True)
def __():
    label_mapping = {
        'conservative': 0,
        'liberal': 1
    }

    issue_type_mapping = {
        'economic': 0,
        'environmental': 1,
        'family/gender': 2,
        'geo-political and foreign policy': 3,
        'political': 4,
        'racial justice and immigration': 5,
        'religious': 6,
        'social, health and education': 7
    }
    return issue_type_mapping, label_mapping


@app.cell(hide_code=True)
def __(issue_type_mapping, label_mapping):
    label_mapping_reversed = {v: k for k, v in label_mapping.items()}
    issue_type_mapping_reversed = {v: k for k, v in issue_type_mapping.items()}

    print(label_mapping_reversed)
    print(issue_type_mapping_reversed)
    return issue_type_mapping_reversed, label_mapping_reversed


@app.cell(hide_code=True)
def __(df, issue_type_mapping_reversed, label_mapping_reversed, mo):
    df['label_text'] = df['label'].replace(label_mapping_reversed)
    df['issue_type_text'] = df['issue_type'].replace(issue_type_mapping_reversed)

    labels_grouped = df['label_text'].value_counts().rename_axis('label_text').reset_index(name='counts')
    issue_types_grouped = (
        df["issue_type_text"]
        .value_counts()
        .rename_axis("issue_type_text")
        .reset_index(name="counts")
    )

    mo.md("""
    ## 3. Mapping Labels and Issue Types

    Two dictionaries map labels (conservative and liberal) and issue types (e.g., economic, environmental) to numerical values for machine learning purposes. Reversed mappings are created to convert numerical labels back into their text form.
    """)
    return issue_types_grouped, labels_grouped


@app.cell(hide_code=True)
def __(df):
    df.iloc[:, :6].head(7)
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        """
        ## 4. Visualizing Data Distributions

        Bar plots visualize the proportions of conservative vs. liberal ideologies and the count of different issue types. These provide an overview of the dataset composition.
        """
    )
    return


@app.cell(hide_code=True)
def __(alt, labels_grouped, mo):
    mo.ui.altair_chart(
        alt.Chart(labels_grouped).mark_bar(
            fill='#4C78A8',
            cursor='pointer',
        ).encode(
            x=alt.X('label_text', axis=alt.Axis(labelAngle=0)),
            y='counts:Q'
        )
    )
    return


@app.cell(hide_code=True)
def __(alt, issue_types_grouped, mo):
    mo.ui.altair_chart(
        alt.Chart(issue_types_grouped)
        .mark_bar(
            fill="#4C78A8",
            cursor="pointer",
        )
        .encode(
            x=alt.X(
                "issue_type_text:O",
                axis=alt.Axis(
                    labelAngle=-10, labelAlign="center", labelPadding=10
                ),
            ),
            y="counts:Q",
        )
    )
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        r"""
        ## 5. Text Preprocessing

        Texts are preprocessed to remove uninformative words: statements are lowercased, stripped of punctuation, tokenized, filtered of stopwords, and lemmatized.
        """
    )
    return


@app.cell(hide_code=True)
def __(WordCloud, df):
    # Join with spaces so adjacent statements do not merge into a single token.
    all_text = ' '.join(df['statement'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
    return all_text, wordcloud


@app.cell(hide_code=True)
def __(plt, wordcloud):
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # call the function; assigning plt.axis = 'off' would shadow it
    plt.gca()
    return


@app.cell(hide_code=True)
def __(WordNetLemmatizer, nltk, stopwords):
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    return lemmatizer, stop_words


@app.cell(hide_code=True)
def __(lemmatizer, re, stop_words, word_tokenize):
    # Function for preprocessing text
    def preprocess_text(text):
        # 1. Lowercase the text
        text = text.lower()

        # 2. Remove punctuation and non-alphabetical characters
        text = re.sub(r'[^a-z\s]', '', text)

        # 3. Tokenize the text
        tokens = word_tokenize(text)

        # 4. Remove stopwords and lemmatize each token
        processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

        return processed_tokens
    return (preprocess_text,)
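
# Rough example of the pipeline above (a hedged sketch, not part of the app):
# preprocess_text("Taxes are too high!") lowercases, strips the "!", drops the
# stopwords "are"/"too", and lemmatizes "taxes", yielding something like
# ['tax', 'high'].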


@app.cell(hide_code=True)
def __(df, df_test, df_val, preprocess_text):
    # Apply the preprocessing function to the 'statement' column
    df['processed_statement'] = df['statement'].apply(preprocess_text)
    df_val['processed_statement'] = df_val['statement'].apply(preprocess_text)
    df_test['processed_statement'] = df_test['statement'].apply(preprocess_text)
    processed_statement = df['processed_statement']
    return (processed_statement,)


@app.cell(hide_code=True)
def __(mo):
    mo.md(r"""## 6. Word Embeddings""")
    return


@app.cell(hide_code=True)
def __(np):
    def get_doc_embedding(tokens, embeddings_model):
        vectors = [embeddings_model.wv[word] for word in tokens if word in embeddings_model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(embeddings_model.vector_size)
    return (get_doc_embedding,)
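
# Note (a hedged aside): get_doc_embedding mean-pools word vectors, so every
# document maps to a single 100-dim vector regardless of its length; documents
# with no in-vocabulary tokens fall back to the zero vector.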


@app.cell(hide_code=True)
def __(FastText, Word2Vec, processed_statement):
    embedding_models = {
        'fasttext': FastText(sentences=processed_statement, vector_size=100, window=3, min_count=1, seed=0, workers=1),
        'word2vec': Word2Vec(sentences=processed_statement, vector_size=100, window=3, min_count=1, seed=0, workers=1)
    }
    return (embedding_models,)
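
# Design note (hedged): FastText composes vectors from character n-grams, so
# embedding_models['fasttext'].wv can produce a vector even for words unseen in
# training, while Word2Vec only covers its training vocabulary -- plausibly one
# reason the classifier below is fed the FastText embeddings.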


@app.cell(hide_code=True)
def __(mo):
    mo.md(r"""### 6.1 Word Embedding using FastText and Word2Vec""")
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        """
        #### Dimensionality Reduction using UMAP

        Embeddings are projected into a 2D space using UMAP for visualization. The points are colored by ideology or issue type, showing clusters of similar statements.

        Interactive Altair scatter plots show ideology and issue types in 2D space. A brush selection tool lets users explore specific points and view tooltip information.

        #### Combined Scatter Plot

        The two scatter plots are combined side by side for direct comparison of ideologies vs. issue types.

        #### Running the Code

        Run the notebook through the `marimo.App` instance (e.g. `marimo edit app.py`); it can also be run as a standalone Python script with `python app.py`.
        """
    )
    return


@app.cell(hide_code=True)
def __(UMAP, alt, df, mo, np):
    def word_embedding_2d(embedding_model, embedding_model_name):
        embeddings_matrix = np.vstack(df[f'embeddings_{embedding_model_name}'].values)

        umap = UMAP(n_components=2, random_state=42)
        umap_results = umap.fit_transform(embeddings_matrix)

        df[f'{embedding_model_name}_x'] = umap_results[:, 0]
        df[f'{embedding_model_name}_y'] = umap_results[:, 1]

        brush = alt.selection_interval()
        size = 350

        points1 = alt.Chart(df, height=size, width=size).mark_point().encode(
            x=f'{embedding_model_name}_x:Q',
            y=f'{embedding_model_name}_y:Q',
            color=alt.condition(brush, 'label_text', alt.value('grey')),
            tooltip=[f'{embedding_model_name}_x:Q', f'{embedding_model_name}_y:Q', 'statement:N', 'label_text:N']
        ).add_params(brush).properties(title='By Political Ideologies')

        scatter_chart1 = mo.ui.altair_chart(points1)

        points2 = alt.Chart(df, height=size, width=size).mark_point().encode(
            x=f'{embedding_model_name}_x:Q',
            y=f'{embedding_model_name}_y:Q',
            color=alt.condition(brush, 'issue_type_text', alt.value('grey')),
            tooltip=[f'{embedding_model_name}_x:Q', f'{embedding_model_name}_y:Q', 'statement:N', 'issue_type_text:N']
        ).add_params(brush).properties(title='By Issue Types')

        scatter_chart2 = mo.ui.altair_chart(points2)

        combined_chart = (scatter_chart1 | scatter_chart2)
        return combined_chart
    return (word_embedding_2d,)


@app.cell(hide_code=True)
def __(
    df,
    df_test,
    df_val,
    embedding_models,
    get_doc_embedding,
    word_embedding_2d,
):
    for name, embedding_model in embedding_models.items():
        df['embeddings_' + name] = df['processed_statement'].apply(lambda x: get_doc_embedding(x, embedding_model))
        df_val['embeddings_' + name] = df_val['processed_statement'].apply(lambda x: get_doc_embedding(x, embedding_model))
        df_test['embeddings_' + name] = df_test['processed_statement'].apply(lambda x: get_doc_embedding(x, embedding_model))

    fasttext_plot = word_embedding_2d(embedding_models['fasttext'], 'fasttext')
    word2vec_plot = word_embedding_2d(embedding_models['word2vec'], 'word2vec')

    test_embeddings_fasttext = df_test['embeddings_fasttext']
    return (
        embedding_model,
        fasttext_plot,
        name,
        test_embeddings_fasttext,
        word2vec_plot,
    )


@app.cell(hide_code=True)
def __(fasttext_plot, mo):
    fasttext_table = fasttext_plot.value[['statement', 'label_text', 'issue_type_text']]
    fasttext_chart = mo.vstack([
        fasttext_plot,
        fasttext_table
    ])
    return fasttext_chart, fasttext_table


@app.cell(hide_code=True)
def __(mo, word2vec_plot):
    word2vec_table = word2vec_plot.value[['statement', 'label_text', 'issue_type_text']]
    word2vec_chart = mo.vstack([
        word2vec_plot,
        word2vec_table
    ])
    return word2vec_chart, word2vec_table


@app.cell(hide_code=True)
def __(fasttext_chart, mo, word2vec_chart):
    mo.ui.tabs({
        'FastText': fasttext_chart,
        'Word2Vec': word2vec_chart
    })
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        r"""
        ## Data Insights

        - Ideology Distribution: Visualizes proportions of conservative and liberal ideologies.
        - Issue Types: Bar plot reveals the diversity and frequency of issue types in the dataset.
        - Word Embeddings: Using UMAP for 2D projections helps identify clusters in political statements.
        - Interactive Exploration: Offers detailed, interactive views on ideology vs. issue type distribution.

        This code provides a thorough analysis pipeline, from data loading to interactive visualizations, enabling an in-depth exploration of political ideologies.
        """
    )
    return


@app.cell(hide_code=True)
def __(mo):
    mo.md(
        r"""
        ## Building Model

        The classifier is a bidirectional GRU over the document embeddings, with dropout and L2 regularization:

        ```python
        clf_model = Sequential()
        clf_model.add(Bidirectional(tf.keras.layers.GRU(64,
                                    activation='relu',
                                    # return_sequences=True,
                                    input_shape=(sent_length, input_dim),
                                    kernel_regularizer=tf.keras.regularizers.l2(0.001))))  # L2 regularization
        clf_model.add(tf.keras.layers.Dropout(0.5))
        clf_model.add(Dense(2,
                            activation='softmax',
                            kernel_regularizer=tf.keras.regularizers.l2(0.001)))  # L2 regularization in the Dense layer
        ```
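
        The commented-out training cells further down compile and fit this model;
        the compile step (taken verbatim from those cells) is:

        ```python
        clf_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        ```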
        """
    )
    return


@app.cell(hide_code=True)
def __(df_test, np, test_embeddings_fasttext):
    # X_train = np.array(df['embeddings_fasttext'].tolist())
    # X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    # y_train = df['label'].values

    # X_val = np.array(df_val['embeddings_fasttext'].tolist())
    # X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
    # y_val = df_val['label'].values

    X_test = np.array(test_embeddings_fasttext.tolist())
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    y_test = df_test['label'].values
    return X_test, y_test
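
# Shape note (hedged): each FastText document embedding is 100-dimensional, so
# the reshape above yields X_test of shape (n_samples, 1, 100) -- the
# (timesteps, features) layout the recurrent classifier expects.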


@app.cell(hide_code=True)
def __():
    # all_tokens = [token for tokens in df['processed_statement'] for token in tokens]
    # vocab_size = len(set(all_tokens))
    # vocab_size
    # input_dim = X_train.shape[1]   # dimensionality of the embeddings used (e.g. 50 or 100)
    # sent_length = X_train.shape[1] # size of each embedding dimension

    # input_dim, sent_length
    return


@app.cell(hide_code=True)
def __():
    # seed_value = 345
    # np.random.seed(seed_value)
    # random.seed(seed_value)
    # tf.random.set_seed(seed_value)

    # clf_model = Sequential()
    # clf_model.add(Bidirectional(tf.keras.layers.GRU(64,
    #                             activation='relu',
    #                             # return_sequences=True,
    #                             input_shape=(sent_length, input_dim),
    #                             kernel_regularizer=tf.keras.regularizers.l2(0.001))))  # L2 regularization
    # clf_model.add(tf.keras.layers.Dropout(0.5))
    # clf_model.add(Dense(2,
    #                     activation='softmax',
    #                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))  # L2 regularization in the Dense layer

    # clf_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # clf_model.summary()
    return


@app.cell(hide_code=True)
def __():
    # lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-10)

    # model_history = clf_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=16, verbose=2, callbacks=[lr_scheduler])
    return


@app.cell(hide_code=True)
def __():
    # clf_model.save('models/model_8812.keras')
    # joblib.dump(model_history, 'history/history_model_8812.pkl')
    return


@app.cell(hide_code=True)
def __(joblib, tf):
    loaded_model = tf.keras.models.load_model('models/model_8812.keras')
    model_history_loaded = joblib.load('history/history_model_8812.pkl')

    # loaded_model = clf_model
    # model_history_loaded = model_history
    return loaded_model, model_history_loaded


@app.cell(hide_code=True)
def __(model_history_loaded, pd):
    history_data = {
        'epoch': range(1, len(model_history_loaded.history['accuracy']) + 1),
        'accuracy': model_history_loaded.history['accuracy'],
        'val_accuracy': model_history_loaded.history['val_accuracy'],
        'loss': model_history_loaded.history['loss'],
        'val_loss': model_history_loaded.history['val_loss']
    }

    history_df = pd.DataFrame(history_data)
    return history_data, history_df


@app.cell(hide_code=True)
def __(alt, history_df, mo):
    accuracy_chart = alt.Chart(history_df).transform_fold(
        ['accuracy', 'val_accuracy'],
        as_=['type', 'accuracy']
    ).mark_line().encode(
        x='epoch:Q',
        y='accuracy:Q',
        color='type:N',
        tooltip=['epoch', 'accuracy']
    ).properties(title='Training and Validation Accuracy')

    loss_chart = alt.Chart(history_df).transform_fold(
        ['loss', 'val_loss'],
        as_=['type', 'loss']
    ).mark_line().encode(
        x='epoch:Q',
        y='loss:Q',
        color='type:N',
        tooltip=['epoch', 'loss']
    ).properties(title='Training and Validation Loss')

    mo.hstack([accuracy_chart | loss_chart])
    return accuracy_chart, loss_chart


@app.cell(hide_code=True)
def __(X_test, loaded_model, np):
    y_pred = loaded_model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)
    return (y_pred,)


@app.cell(hide_code=True)
def __(accuracy_score, mo, y_pred, y_test):
    mo.md(f"Accuracy score: **{round(accuracy_score(y_test, y_pred) * 100, 2)}**%")
    return


@app.cell(hide_code=True)
def __(classification_report, mo, y_pred, y_test):
    with mo.redirect_stdout():
        print(classification_report(y_test, y_pred))
    return


@app.cell(hide_code=True)
def __(embedding_models, get_doc_embedding, loaded_model, preprocess_text):
    def try_predict(text):
        tokenized = preprocess_text(text)
        embedded = get_doc_embedding(tokenized, embedding_models['fasttext'])
        embedded = embedded.reshape(1, 1, -1)
        prediction = loaded_model.predict(embedded)
        predicted_class = int(prediction.argmax())
        # Label 1 is 'liberal' in the dataset's label mapping.
        predicted_class = "Liberal" if predicted_class == 1 else "Conservative"
        return predicted_class
    return (try_predict,)
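
# Usage sketch (hedged, with a made-up input): try_predict("The government
# should cut spending and lower taxes for families") runs the full pipeline --
# preprocess, FastText mean-pool, reshape to (1, 1, 100), softmax -- and
# returns "Liberal" or "Conservative".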


@app.cell(hide_code=True)
def __():
    def validate(value):
        if len(value.split()) < 15:
            return 'Please enter at least 15 words.'
    return (validate,)


@app.cell(hide_code=True)
def __(mo, validate):
    form = mo.ui.text_area(placeholder="...").form(validate=validate)
    return (form,)


if __name__ == "__main__":
    app.run()