RealKintaro's picture
Init
7f9da02
raw
history blame contribute delete
3.3 kB
import re
import string
import nltk
# Fetch the NLTK "stopwords" corpus when this module is imported, then load its
# Arabic word list into a set for the membership tests in remove_stopwords().
# NOTE(review): this call runs on every import and presumably needs network
# access when the corpus is not already cached -- confirm that is acceptable
# for the deployment environment.
nltk.download('stopwords')
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))
# Compiled pattern matching any single Arabic diacritic listed below, or the
# tatweel (kashida) elongation character.  The pattern is compiled with
# re.VERBOSE, so the whitespace and the "# ..." labels inside the literal are
# ignored by the regex engine and serve only as in-pattern documentation.
arabic_diacritics = re.compile("""
ّ | # Tashdid
َ | # Fatha
ً | # Tanwin Fath
ُ | # Damma
ٌ | # Tanwin Damm
ِ | # Kasra
ٍ | # Tanwin Kasr
ْ | # Sukun
ـ # Tatwil/Kashida
""", re.VERBOSE)
# Characters treated as punctuation by cleaning_content(): a hand-written set
# of Arabic/typographic marks plus the ASCII punctuation from the standard
# library.  Some characters appear more than once in the combined string; that
# is harmless because it is only ever used as a set of individual characters
# when building translation tables.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations = arabic_punctuations + english_punctuations
def remove_urls(text):
    """Strip URL-like substrings from *text*.

    A match consists of an optional ``http``/``https`` scheme, the literal
    ``://``, and then a run of word characters or any of ``. / ? = & %``
    that ends on a word boundary.  Any other character (for example ``-``)
    terminates the match, so the remainder of such a URL is left in place.

    Args:
        text: Input string.

    Returns:
        ``text`` with every match deleted.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
    return re.sub(url_pattern, '', text, flags=re.MULTILINE)
def remove_emails(text):
    """Delete e-mail addresses from *text*.

    An address is recognised as ``local@host.tld`` where the local part is
    made of ASCII letters, digits and ``_ . + -``, the host of ASCII
    letters, digits and ``-``, and the remainder of ASCII letters, digits,
    ``-`` and ``.``.  Addresses are removed wherever they occur; they do not
    need to occupy a whole line.

    Args:
        text: Input string.

    Returns:
        ``text`` with every recognised address deleted.  Surrounding
        whitespace is left untouched.
    """
    # The pattern is intentionally not anchored with ^/$: an anchored
    # (validation-style) pattern only matches when the address is the whole
    # line, which leaves addresses embedded in running text in place.
    email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
    return re.sub(email_pattern, "", text)
# def remove_emoji(text):
# return emoji.get_emoji_regexp().sub(u'', text)
# Compiled once at import time instead of being rebuilt on every call.
# The character class deliberately avoids one wide range spanning U+24C2 to
# U+1F251: such a range also covers CJK, Hangul and the Arabic Presentation
# Forms blocks, which are ordinary text rather than emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010ffff"  # every supplementary-plane code point (emoticons, pictographs, flags, ...)
    "\u2500-\u2BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\u231a"  # watch
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u24C2"  # circled M
    "\u3030"  # wavy dash
    "\u303D"  # part alternation mark
    "\u3297"  # circled ideograph "congratulation"
    "\u3299"  # circled ideograph "secret"
    "\u200d"  # zero-width joiner used in emoji sequences
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "]+",
    re.UNICODE,
)


def remove_emoji(data):
    """Delete emoji and pictographic symbols from *data*.

    Removes every code point outside the Basic Multilingual Plane, the
    U+2500-U+2BEF symbol range, and a handful of individual BMP code points
    used as emoji or as emoji-sequence glue (zero-width joiner, variation
    selector-16).  Other BMP text, including Arabic Presentation Forms and
    CJK, is preserved.

    Args:
        data: Input string.

    Returns:
        ``data`` with all matching characters deleted.
    """
    return _EMOJI_PATTERN.sub('', data)
def normalization(text):
    """Collapse Arabic letter variants onto a single canonical letter.

    The substitutions are applied in the order listed in the table below;
    each one replaces every occurrence of the pattern in the string.

    Args:
        text: Input string.

    Returns:
        The string with all listed variants replaced.
    """
    substitutions = (
        ("[إأآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),  # alef maqsura -> ya
        ("ؤ", "ء"),  # waw with hamza -> hamza
        ("ئ", "ء"),  # ya with hamza -> hamza
        ("ة", "ه"),  # ta marbuta -> ha
        ("گ", "ك"),  # gaf -> kaf
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def remove_diacritics(text):
    """Return *text* with every match of ``arabic_diacritics`` deleted.

    Args:
        text: Input string.

    Returns:
        The string without the characters matched by the module-level
        ``arabic_diacritics`` pattern.
    """
    return arabic_diacritics.sub('', text)
def remove_stopwords(text):
    """Drop every whitespace-delimited token found in ``arabic_stopwords``.

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by single spaces; any run of whitespace
        in the input is therefore collapsed.
    """
    tokens = text.split()
    return ' '.join(token for token in tokens if token not in arabic_stopwords)
def cleaning_content(line):
    """Clean one raw text record into a normalized, space-separated string.

    Steps, in order: replace newlines with spaces; remove e-mail addresses,
    URLs and emoji; drop every whitespace-delimited token that contains
    ``@``; delete the markers ``RT``, ``<LF>``, ``<br />``, ``&quot;`` and
    ``<url>``; replace punctuation with spaces; drop Arabic stopwords;
    normalize letter variants and strip diacritics; trim.

    Args:
        line: The text to clean.  Any non-string value (for example a float
            NaN standing in for a missing value) is treated as missing.

    Returns:
        The cleaned string, or ``None`` when *line* is not a ``str``.
    """
    if not isinstance(line, str):
        return None

    # str.replace returns a new string, so the result has to be re-bound.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Any token containing '@' is swapped for a placeholder that is deleted
    # together with the other markers below.
    line = ' '.join('USERID' if '@' in w else w for w in line.split())

    # NOTE(review): these are plain substring deletions, so 'RT' is also
    # removed from inside longer Latin-script words -- confirm this is wanted.
    for marker in ('RT', '<LF>', '<br />', '&quot;', '<url>', 'USERID'):
        line = line.replace(marker, '')

    # Replace each punctuation character with a space so that the words on
    # either side stay separate tokens.  remove_stopwords() re-splits on
    # whitespace, so the amount of padding does not affect the result.
    line = line.translate(str.maketrans({ch: ' ' for ch in punctuations}))

    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))
    return line.strip()
def hasDigits(s):
    """Report whether *s* contains an ASCII or Arabic-Indic decimal digit.

    Args:
        s: Iterable of single characters (typically a string).

    Returns:
        ``True`` if any character has a code point in 48-57 (``0``-``9``)
        or 1632-1641 (U+0660-U+0669); ``False`` otherwise, including for
        empty input.
    """
    for char in s:
        code_point = ord(char)
        if 48 <= code_point <= 57:
            return True
        if 1632 <= code_point <= 1641:
            return True
    return False