StKirill committed on
Commit
e7ade8f
·
verified ·
1 Parent(s): 78d5fc2

Delete parsing.py

Browse files
Files changed (1) hide show
  1. parsing.py +0 -245
parsing.py DELETED
@@ -1,245 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """parsing.ipynb
3
-
4
- Automatically generated by Colaboratory.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1thvkAz498jADcaVirJG91V-3-XBhdkq1
8
- """
9
-
10
- import requests
11
- from bs4 import BeautifulSoup
12
-
13
- import re
14
- import os
15
-
16
- import pandas as pd
17
- import numpy as np
18
-
19
- from tqdm import tqdm
20
-
21
def get_transcripts_from_url(url):
    """Collect absolute transcript URLs from the episode index page.

    Parameters
    ----------
    url : str
        Address of the index page whose ``<li><a href=...>`` entries link
        to individual episode transcripts.

    Returns
    -------
    list[str]
        Absolute URLs, one per linked transcript.
    """
    # Fetch and parse the index page.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    transcript_paths = []
    for item in soup.find_all('li'):
        anchor = item.find('a')
        # Fix: the original assumed every <li> wraps an <a>; a bare list
        # item would raise AttributeError on .get("href").
        if anchor is None:
            continue
        href = anchor.get("href")
        if not href:
            continue
        transcript_paths.append("https://fangj.github.io/friends/" + href)

    return transcript_paths
42
-
43
def get_text_from_html(url):
    """Download one transcript page and save its lower-cased plain text.

    The whole page text (apostrophes, quotes and non-breaking spaces
    removed) is written to ``friends_raw_scripts/<page-name>.txt``.

    Parameters
    ----------
    url : str
        Absolute URL of a single episode transcript page.

    Returns
    -------
    str
        The ``.txt`` file name the transcript was saved under.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Fix: dropped the unused `transcript = soup.find_all('p')` query and
    # the redundant `path = url` alias from the original.
    transcript_name = url.split("/")[-1].replace(".html", ".txt")

    with open(os.path.join("friends_raw_scripts", transcript_name), 'w', encoding='utf-8') as file:
        # Lower-case everything and strip characters that would confuse
        # the later "name: line" parsing (quotes, non-breaking spaces).
        text = (
            soup.get_text(strip=False)
            .lower()
            .replace("'", "")
            .replace('"', "")
            .replace("\xa0", "")
        )
        file.write(text + "\n")

    return transcript_name
60
-
61
def clean_and_write_text(transcript_name):
    """Parse a raw transcript into "character: line" pairs and save them.

    Reads ``friends_raw_scripts/<transcript_name>``, skips scene markers
    and stage directions, glues multi-line speeches together, and writes
    the result to ``friends_preprocessed_scripts/pre_<transcript_name>``
    with one ``name: text`` pair per line.

    Parameters
    ----------
    transcript_name : str
        File name (not path) of a raw transcript produced by
        ``get_text_from_html``.
    """
    char = []
    texts = []
    flag = None  # "char" while accumulating one speaker's multi-line speech

    with open(os.path.join("friends_raw_scripts", transcript_name), 'r', encoding='utf-8') as file:
        final_transcript = file.readlines()

    # "name:" at a word boundary marks the start of a character's line.
    # (The original compiled this pattern twice and carried an unused
    # `skip_lines` variable; both removed.)
    pattern = re.compile(r'\b\w+:')
    scene_words = ["commercial break", "closing credits", "opening credits", "end"]

    for ind in range(1, len(final_transcript) - 1):
        pre_line = final_transcript[ind - 1].strip()
        cur_line = final_transcript[ind].strip()
        next_line = final_transcript[ind + 1].strip()

        # Same lines with (...) / [...] stage directions removed.
        next_condition = re.sub(r"\([^()]*\)|\[[^\[\]]*\]", '', next_line).strip()
        cur_condition = re.sub(r"\([^()]*\)|\[[^\[\]]*\]", '', cur_line).strip()

        # Skip a line that stands alone between two blank lines.
        if sum([bool(pre_line), bool(cur_line), bool(next_line)]) == 1:
            continue
        elif cur_line in scene_words:
            continue
        elif "by:" in cur_line or "note:" in cur_line:  # credits / notes
            continue
        elif "[" in cur_line or "]" in cur_line:  # scene descriptions
            continue
        elif not cur_condition:  # nothing left after removing directions
            continue
        elif pattern.search(cur_line) and flag is None:
            # New speaker: split off the name and start collecting text.
            name, text = cur_line.split(":", maxsplit=1)
            char.append(name)
            text = re.sub(r'\([^)]*\)', '', text)
            text = text.strip()
            flag = "char"

            # The speech ends when the next line starts a new speaker, is
            # empty after cleanup, or is a scene marker.
            if pattern.search(next_line) or not next_condition or next_line in scene_words or "[" in next_line:
                texts.append(text)
                flag = None

                if len(char) != len(texts):  # debug aid: parser out of sync
                    print(ind)
                    print(char[-1], texts[-1])
        elif cur_line and flag == 'char':
            # Continuation of the current speaker's speech.
            text += " " + cur_line
            if pattern.search(next_line) or not next_condition or next_line in scene_words or "[" in next_line:
                text = re.sub(r"\([^()]*\)|\[[^\[\]]*\]", '', text).strip()
                texts.append(text)
                flag = None

                if len(char) != len(texts):
                    print(ind)
                    print(char[-1], texts[-1])

    new_name = "pre_" + transcript_name
    with open(os.path.join("friends_preprocessed_scripts", new_name), 'w', encoding='utf-8') as file:
        for c, d in zip(char, texts):
            file.write(f"{c}: {d}\n")
129
-
130
# Flip to True to reuse already-downloaded transcripts instead of scraping.
raw_texts_exists = False
# Scrape the episode index page for one URL per transcript.
transcript_paths = get_transcripts_from_url("https://fangj.github.io/friends/")

transcript_paths[:10]  # notebook leftover: peek at the first few URLs

# Target directories for the raw and the cleaned transcript files.
os.makedirs("friends_preprocessed_scripts", exist_ok=True)
os.makedirs("friends_raw_scripts", exist_ok=True)

if not raw_texts_exists:
    print("Parse all scripts from this website https://fangj.github.io/friends/")
    for path in tqdm(transcript_paths, desc='Total'):
        transcript_name = get_text_from_html(path)
        clean_and_write_text(transcript_name)

dir_list = [file for file in os.listdir("./friends_preprocessed_scripts")]
148
-
149
def df_scripts(path):
    """Convert one preprocessed transcript into a per-episode CSV.

    Reads ``friends_preprocessed_scripts/<path>`` (one ``name: line`` per
    row) and writes ``dataframes/friends/df_<episode>.csv`` with columns
    'Characters' and 'Dialogs'.

    Parameters
    ----------
    path : str
        File name of a preprocessed transcript, e.g. ``pre_0101.txt``.
    """
    chars = []
    texts = []

    with open(os.path.join("friends_preprocessed_scripts", path), 'r', encoding="utf-8") as file:
        for line in file:
            char, text = line.split(":", 1)
            chars.append(char)
            texts.append(text.strip().lower())

    # Fix: the original replaced the prefix "prep_SP_" — a leftover from a
    # South Park script — which never occurs here; the files produced by
    # clean_and_write_text are named "pre_*.txt".
    df_name = path.replace("pre_", "df_", 1).replace(".txt", ".csv")
    df = pd.DataFrame({'Characters': chars, 'Dialogs': texts})
    df.to_csv(os.path.join("dataframes", "friends", df_name), index=False)
163
-
164
# Make sure the per-episode CSV target directory exists, then turn every
# preprocessed transcript into its own dataframe file.
os.makedirs("dataframes/friends", exist_ok=True)

for script_file in dir_list:
    df_scripts(script_file)
168
-
169
def collect_df():
    """Concatenate every per-episode CSV into one dialog dataframe.

    Returns
    -------
    pd.DataFrame
        All episodes stacked, NaN rows dropped, index reset.
    """
    frames = [
        pd.read_csv(os.path.join("dataframes", "friends", name))
        for name in os.listdir("dataframes/friends")
    ]
    combined = pd.concat(frames, ignore_index=True).dropna().reset_index(drop=True)
    return combined
177
-
178
- """### Which most frequent characters we can meet in the movie"""
179
-
180
def form_df(df, char):
    """Pair each line addressed to *char* with *char*'s following reply.

    Every row spoken by `char` is a candidate answer; the row immediately
    before it (when spoken by someone else) is treated as the prompt.
    Prompts and replies are renamed and concatenated side by side.

    Parameters
    ----------
    df : pd.DataFrame
        Full dialog table with 'Characters' and 'Dialogs' columns.
    char : str
        Lower-cased name of the character of interest.

    Returns
    -------
    pd.DataFrame
        Columns 'questioner', 'question', `char`, 'answer'.
    """
    # Rows spoken by the favorite character, and the rows just before them.
    spoken_idx = df[df.Characters == char].index.tolist()
    prompt_idx = (np.array(spoken_idx) - 1).tolist()

    char_lines = df[df.index.isin(spoken_idx)]
    prompts = df[df.index.isin(prompt_idx)]
    # Drop prompts spoken by the character themselves.
    prompts = prompts[prompts["Characters"] != char]

    # No '?' filtering: every preceding line counts as a "question".
    questions = prompts
    answer_idx = (np.array(questions.index.tolist()) + 1).tolist()
    answers = char_lines[char_lines.index.isin(answer_idx)]

    # Rename for the final side-by-side layout.
    questions = questions.rename(
        columns={"Characters": "questioner", "Dialogs": "question"})
    answers = answers.rename(columns={"Characters": char, "Dialogs": "answer"})

    questions.reset_index(inplace=True, drop=True)
    answers.reset_index(inplace=True, drop=True)

    return pd.concat([questions, answers], axis=1)
226
-
227
- """## Choose your favorite character"""
228
-
229
- # concatenate data in one single dataframe
230
- df = collect_df()
231
- df.to_csv("full_trancscripts.csv", index=False)
232
-
233
- # form the final dataset for tf-idf / word2vec, which no need labels between strings
234
- characters = ["rachel", "ross", "chandler", "monica", "joey", "phoebe"]
235
-
236
- print()
237
- for char in characters:
238
- print(f"Prepare data for {char} -> {char}_friends.csv")
239
- df_char = form_df(df, char)
240
- # create final dataframe
241
- df_char.to_csv(char + "_friends.csv", index=False)
242
-
243
- print("scripts created")
244
-
245
-