-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_spanish_tweets.py
65 lines (45 loc) · 2.12 KB
/
preprocess_spanish_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
import re, csv
# Spanish special characters: áéíóúüñâç¿¡
# Spanish special characters upper-case: ÁÉÍÓÚÜÑÂÇ
def preprocess(tweet):
    """Normalise a Spanish tweet into a space-separated token string.

    Steps, in order:

    1. Emoticons become words: ":(" -> "tristeza", ":)" -> "ja",
       ":D" / "xD" / "XD" -> "jaja".
    2. Two or more consecutive "ja" (or "JA") collapse to "jaja EMPHASIS".
    3. Punctuation is padded with spaces and the text is split on
       whitespace.
    4. For every token:
       - a letter repeated three or more times is collapsed to a single
         letter and an "EMPHASIS" marker is emitted before the token;
       - a leading "#" or "@" is emitted as a separate token;
       - a token containing four or more consecutive capitals is
         lower-cased after its first character;
       - every remaining non-letter character is padded with spaces;
       - lower/UPPER/lower and UPPER/lower/UPPER letter triples get a
         space inserted at the case change.

    Args:
        tweet: the raw tweet text.

    Returns:
        The processed text. Every emitted piece is followed by a space, so
        tokens may be separated by more than one space and a non-empty
        result ends with a space.

    NOTE(review): the accented character classes match whole characters
    only when `tweet` and these literals are text (unicode) strings; on
    Python 2 byte strings they match individual UTF-8 bytes. Confirm how
    callers pass their input.
    """
    lower = 'áéíóúüñâça-z'
    upper = 'ÁÉÍÓÚÜÑÂÇA-Z'
    letters = lower + upper

    # Step 1: emoticons. Order matters only in that each replacement is
    # applied to the result of the previous one.
    emoticons = (
        (':(', ' tristeza '),
        (':)', ' ja '),
        (':D', ' jaja '),
        ('xD', ' jaja '),
        ('XD', ' jaja '),
    )
    for emoticon, word in emoticons:
        tweet = tweet.replace(emoticon, word)

    # Step 2: laughter. The quantifier is {2,}, so a plain "jaja" (including
    # the one produced by ":D" above) also receives the marker.
    # NOTE(review): this marker later passes through the all-caps rule below
    # and comes out as "Emphasis", unlike the "EMPHASIS" emitted for
    # stretched letters -- confirm whether that difference is intended.
    tweet = re.sub(r'(ja){2,}', r' jaja EMPHASIS ', tweet)
    tweet = re.sub(r'(JA){2,}', r' jaja EMPHASIS ', tweet)

    # Step 3: pad punctuation. Inside the class, `%-=` is a range (ASCII '%'
    # through '='), so digits, apostrophes, '-' and '<' are padded here too.
    tweet = re.sub(r'([.,!?()¡¿*&/|:;"$%-=+–_])', r' \1 ', tweet)

    pieces = []
    for wd in tweet.split():
        # Stretched letters ("holaaa"): findall yields (run, letter) pairs.
        stretched = re.findall('(([' + letters + r'])\2{2,})', wd)
        if stretched:
            pieces.append('EMPHASIS ')
            for run, letter in stretched:
                wd = wd.replace(run, letter)

        # Hashtag / mention sigil becomes its own token.
        if wd.startswith(('#', '@')):
            pieces.append(wd[0] + ' ')
            wd = wd[1:]

        # Shouted words: keep the first character, lower-case the rest.
        if re.search('[' + upper + ']{4,}', wd):
            wd = wd[0] + wd[1:].lower()

        # Pad every non-letter in one pass via a capture group. Using each
        # character as its own regex pattern would raise on metacharacters
        # such as '?', '(' or '+', which reach this point as lone tokens.
        wd = re.sub('([^' + letters + '])', r' \1 ', wd)

        # camelCase boundary: "aMu" -> "a Mu".
        for triple in re.findall(
                '[' + lower + '][' + upper + '][' + lower + ']', wd):
            wd = wd.replace(triple, triple[0] + ' ' + triple[1:])

        # "AbC" -> "Ab C".
        for triple in re.findall(
                '[' + upper + '][' + lower + '][' + upper + ']', wd):
            wd = wd.replace(triple, triple[:2] + ' ' + triple[2])

        pieces.append(wd + ' ')

    return ''.join(pieces)
if __name__ == '__main__':
    # Smoke test: run the preprocessor on one sample tweet and print the result.
    tweet = '"@marianorajoy: En España las cosas se pueden, se deben y se van a hacer infinitamente mejor que estos últimos 4 años" Eso son soluciones!!'
    # The parenthesised form prints the same text under the Python 2 print
    # statement and is also valid as the Python 3 print function.
    print(preprocess(tweet))