Spaces:

RealKintaro
/

Offensive-Speech-Detection-From-Arabic-Dialects

Runtime error

App Files Files Community

Offensive-Speech-Detection-From-Arabic-Dialects / Deployment /data_cleaning.py

RealKintaro

Init

7f9da02 over 2 years ago

raw

history blame contribute delete

3.3 kB

	import re
	import string
	import nltk
	# Make sure the NLTK "stopwords" corpus is available before it is read below.
	# This runs every time the module is imported.
	nltk.download("stopwords")


	# Arabic stop words held in a set, so that the membership test performed for
	# every token in remove_stopwords() is a hash lookup.
	arabic_stopwords = {*nltk.corpus.stopwords.words("arabic")}

	# Matches one Arabic diacritic mark (or the tatweel elongation character).
	# The alternatives are written as \uXXXX escapes so the combining marks cannot
	# be mangled by editors, and are separated by a real regex alternation ("|");
	# an escaped "\|" would be a literal pipe and the pattern would never match.
	arabic_diacritics = re.compile(r"""
	      \u0651   # Tashdid
	    | \u064E   # Fatha
	    | \u064B   # Tanwin Fath
	    | \u064F   # Damma
	    | \u064C   # Tanwin Damm
	    | \u0650   # Kasra
	    | \u064D   # Tanwin Kasr
	    | \u0652   # Sukun
	    | \u0640   # Tatwil/Kashida
	""", re.VERBOSE)

	# Punctuation marks that show up in Arabic text (Arabic comma, semicolon and
	# question mark, tatweel, typographic quotes, ...). The pipe is written bare:
	# "\|" is not a valid string escape and would add a stray backslash.
	arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
	english_punctuations = string.punctuation
	# Every character that cleaning_content() treats as punctuation.
	punctuations = arabic_punctuations + english_punctuations


	# Optional http/https scheme, "://", then a run of URL characters that must
	# end on a word boundary (so trailing sentence punctuation is not swallowed).
	_URL_PATTERN = re.compile(r"(?:https?)?://[\w\-./?=&%#~+:]*\b")


	def remove_urls (text):
	    """Return *text* with URLs removed.

	    A URL is an optional ``http``/``https`` scheme followed by ``://`` and a
	    run of word characters and common URL punctuation. Hyphens, fragments
	    (``#``), ports (``:``), ``~`` and ``+`` are accepted so that addresses
	    such as ``http://my-site.com/x`` are removed in full.

	    Args:
	        text: The string to clean.

	    Returns:
	        The string with every matched URL replaced by the empty string;
	        surrounding whitespace is left untouched.
	    """
	    return _URL_PATTERN.sub('', text)


	# local-part "@" domain, where the domain is dot-separated labels. Requiring
	# each dot to be followed by a label keeps a sentence-final "." out of the match.
	_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")


	def remove_emails(text):
	    """Return *text* with e-mail addresses removed.

	    Addresses are matched wherever they occur, not only when one address
	    makes up an entire line.

	    Args:
	        text: The string to clean.

	    Returns:
	        The string with every matched address replaced by the empty string.
	    """
	    return _EMAIL_PATTERN.sub("", text)

	# def remove_emoji(text):
	# return emoji.get_emoji_regexp().sub(u'', text)

	# Characters treated as emoji / pictographic symbols.
	# U+24C2 and U+1F251 are listed as individual code points: written as a range
	# (U+24C2-U+1F251) they would cover most of the Basic Multilingual Plane and
	# delete CJK text and Arabic presentation-form ligatures as well.
	_EMOJI_PATTERN = re.compile(
	    "["
	    "\u200d"                 # zero-width joiner (glues emoji sequences)
	    "\ufe0f"                 # variation selector-16 (emoji presentation)
	    "\u231a"                 # watch
	    "\u23cf"                 # eject symbol
	    "\u23e9"                 # fast-forward symbol
	    "\u24c2"                 # circled M
	    "\u3030"                 # wavy dash
	    "\u2500-\u2bef"          # box drawing, shapes, misc symbols, dingbats, arrows
	    "\U00010000-\U0010ffff"  # all supplementary planes (emoticons, pictographs, flags, ...)
	    "]+"
	)


	def remove_emoji(data):
	    """Return *data* with emoji and pictographic symbols removed.

	    Args:
	        data: The string to clean.

	    Returns:
	        The string with every run of characters from the emoji character
	        class deleted. Letters in the Basic Multilingual Plane (Arabic,
	        Latin, CJK, ...) are preserved.
	    """
	    return _EMOJI_PATTERN.sub('', data)

	def normalization(text):
	    """Fold Arabic letter variants into a single canonical letter.

	    The rules are applied in order, each as a regex substitution over the
	    whole string: the hamza/madda forms of alef become bare alef, alef
	    maqsura becomes yeh, hamza on waw or yeh becomes a standalone hamza,
	    teh marbuta becomes heh, and gaf becomes kaf.
	    """
	    rules = (
	        ("[إأآا]", "ا"),
	        ("ى", "ي"),
	        ("ؤ", "ء"),
	        ("ئ", "ء"),
	        ("ة", "ه"),
	        ("گ", "ك"),
	    )
	    for pattern, replacement in rules:
	        text = re.sub(pattern, replacement, text)
	    return text

	def remove_diacritics(text):
	    """Return *text* with every match of ``arabic_diacritics`` deleted."""
	    return arabic_diacritics.sub('', text)

	def remove_stopwords(text):
	    """Drop Arabic stop words from *text*.

	    The text is split on whitespace, tokens found in ``arabic_stopwords``
	    are discarded, and the remaining tokens are re-joined with single spaces.
	    """
	    kept_tokens = (token for token in text.split() if token not in arabic_stopwords)
	    return ' '.join(kept_tokens)

	def cleaning_content(line):
	    """Run the full cleaning pipeline on one piece of text.

	    Steps, in order: flatten newlines, strip e-mail addresses, URLs and
	    emoji, drop every token that contains ``@``, remove dataset markers
	    (``RT``, ``<LF>``, ``<br />``, ``<url>``, ``USERID``) and double quotes,
	    replace punctuation with spaces, remove Arabic stop words, normalize
	    letter variants and strip diacritics.

	    Args:
	        line: The raw text. ``None`` and floats are treated as missing
	            values (presumably NaN cells from a data frame).

	    Returns:
	        The cleaned, whitespace-trimmed string, or ``None`` when *line* is
	        a missing value.
	    """
	    if line is None or isinstance(line, float):
	        return None

	    # str.replace returns a new string; the result has to be kept.
	    line = line.replace('\n', ' ')
	    line = remove_emails(line)
	    line = remove_urls(line)
	    line = remove_emoji(line)

	    # Tokens containing '@' become the USERID placeholder, which is removed
	    # below together with any USERID marker already present in the text.
	    tokens = ['USERID' if '@' in token else token for token in line.split()]
	    line = ' '.join(tokens)
	    for marker in ('RT', '<LF>', '<br />', '"', '<url>', 'USERID'):
	        line = line.replace(marker, '')

	    # Replace each punctuation character with a space so the words on either
	    # side stay separate; remove_stopwords() re-joins tokens with single
	    # spaces, so the extra whitespace does not survive.
	    line = line.translate(str.maketrans({char: ' ' for char in punctuations}))

	    # Stop words are matched against the tokens as written, before
	    # normalization and diacritic removal.
	    line = remove_stopwords(line)
	    line = remove_diacritics(normalization(line))

	    return line.strip()

	def hasDigits(s):
	    """Return True if *s* contains an ASCII digit or an Arabic-Indic digit.

	    Only code points 48-57 ('0'-'9') and 1632-1641 (U+0660-U+0669) count
	    as digits; an empty input yields False.
	    """
	    for char in s:
	        code_point = ord(char)
	        if 48 <= code_point <= 57:
	            return True
	        if 1632 <= code_point <= 1641:
	            return True
	    return False