# read_pdf.py -- forked from exageraldo/raspagem-ordens-camara
# 169 lines (138 loc) · 5.09 KB
# (GitHub page navigation text removed; the bare integers that follow are the
# page's line-number gutter, not program data.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import json
import itertools
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import SpaceTokenizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
# Tokens that find_orador() filters out of the speaker section
# (presumably Brazilian political-party acronyms -- confirm against the PDFs).
PARTIDOS = [
    'PRTB', 'PCB', 'PSTU', 'PP', 'PTdoB', 'PR', 'PSOL',
    'PRB', 'PSL', 'PTN', 'PCO', 'PSDC', 'PHS', 'PMDB',
    'PPS', 'PRP', 'PMN', 'PSC', 'PSDB', 'PSB', 'PCdoB',
    'DEM', 'PTC', 'PT', 'PTB', 'PV', 'PDT', 'PATRIOTA',
    'AVANTE', 'PMB', 'PSD', 'PROS', 'SD', 'MDB',
]

# Tokens dropped from the tokenized text before it is parsed.
PUNCTUATIONS = [
    '(', ')',
    ';', ':',
    '[', ']',
    ',', '-',
]

# Substrings deleted from the raw extracted PDF text via replace_all().
TO_SCAPE = [
    '\n',
    '\\',
    '-',
    'Œ',
]
def replace_all(symbol_scape, content):
    """Return ``content`` with every occurrence of each given symbol removed.

    Args:
        symbol_scape: Iterable of substrings to delete.
        content: The text to clean.

    Returns:
        The cleaned string. Symbols are removed one after another, in
        iteration order, each pass working on the result of the previous one.
    """
    stripped = content
    for unwanted in symbol_scape:
        stripped = stripped.replace(unwanted, '')
    return stripped
def find_orador(keywords, limit=150):
    """Collect the names listed after each ``ORADOR`` token.

    Tokens are gathered from just after an ``ORADOR`` token until a token
    containing two digits followed by ``h`` is seen (presumably a time such
    as "15h" -- confirm against the source PDFs). Gathered tokens found in
    ``PARTIDOS`` are dropped and the rest are split on '.'. Every fragment
    containing "VER" starts a new name; the fragments that follow it are
    joined with single spaces to form that name.

    Args:
        keywords: Sequence of string tokens for the whole document.
        limit: Maximum number of names to return.

    Returns:
        A list of at most ``limit`` name strings, in document order. An entry
        is an empty string when a "VER" marker is not followed by any name
        fragment.
    """
    collecting = False
    collected = []
    for keyword in keywords:
        # The closing token is checked first so it is never collected itself.
        if collecting and re.search(r'\d\dh', keyword):
            collecting = False
        if collecting and keyword != '.':
            collected.append(keyword)
        # Checked last so the ORADOR token that opens a section is skipped.
        if keyword == "ORADOR":
            collecting = True

    fragments = itertools.chain.from_iterable(
        token.split('.') for token in collected if token not in PARTIDOS)

    names = []
    for fragment in fragments:
        # NOTE(review): this pattern matches any fragment that merely
        # contains "VER" (the trailing character classes may match nothing),
        # so a name fragment such as "VERA" also starts a new entry.
        if re.search(r'VER[.ª]*[ª.]*', fragment):
            names.append([])
        elif fragment and fragment != 'a' and names:
            # Skipped here: empty fragments produced by split('.'), which
            # used to add stray spaces; the lone 'a' left over from "VER.a";
            # and fragments seen before the first marker, which used to
            # raise IndexError.
            names[-1].append(fragment)
    return [' '.join(parts) for parts in names][0:limit]
def _new_pauta():
    """Return an empty agenda-item record (every field is an empty string)."""
    return {
        'tipo': '',
        'n': '',
        'responsavel': '',
        'movimento': '',
        'assunto': '',
    }


def _clean_title_fields(pauta):
    """Tidy the 'tipo' and 'responsavel' fields once the title section ends."""
    pauta['tipo'] = pauta['tipo'].rstrip().replace(' .', '')
    pauta['responsavel'] = re.sub(
        r'VER[.a]*[.ª]*[ .]* ', '',
        pauta['responsavel'].rstrip()).replace(' .', '')


def _clean_body_fields(pauta):
    """Tidy the 'assunto' and 'movimento' fields of a finished record."""
    # NOTE(review): removing ' . ' outright also joins the two words around
    # it; kept as-is. The trailing space of 'assunto' is preserved as well.
    pauta['assunto'] = pauta['assunto'].replace(' . ', '')
    # Drops the fixed page-header phrase (followed by digits) when it appears
    # inside the accumulated 'movimento' text.
    pauta['movimento'] = re.sub(
        r'ESTADO DO RIO GRANDE DO NORTE CÂMARA MUNICIPAL DO NATAL PALÁCIO PADRE MIGUELINHO \d[\d]*', '',
        pauta['movimento'].rstrip()).replace('.', '').rstrip()


def find_topics(keywords):
    """Parse the agenda items that follow the first ``PAUTA`` token.

    The tokens are read by a small state machine with three sections per
    item:

    * title    -- opened by 'PROJETO', 'REQUERIMENTO' or 'MOÇÃO'. Tokens go
      to 'tipo' until a token matching ``\\d/20\\d\\d`` is stored as 'n';
      later tokens go to 'responsavel'. Tokens starting with "Nº" are
      skipped.
    * assunto  -- opened by 'ASSUNTO' while in the title section.
    * movimento -- opened by 'MOVIMENTO' while in the assunto section.

    A title keyword seen during the movimento section closes the current
    item and starts the next one. Title keywords inside the assunto section
    are treated as ordinary text.

    Each item is kept in one record from its title through its movimento, so
    the title fields stay with the assunto/movimento that follow them. The
    item still open in its movimento section when the input ends is emitted
    too. Items that never reach a MOVIMENTO token are not emitted.

    Args:
        keywords: Sequence of string tokens for the whole document.

    Returns:
        A list of dicts with the keys 'tipo', 'n', 'responsavel',
        'movimento' and 'assunto' (all strings), in document order.
    """
    title_words = ('PROJETO', 'REQUERIMENTO', 'MOÇÃO')

    pautas = []
    pauta = _new_pauta()
    started = False        # becomes True at the first 'PAUTA' token
    section = None         # None, 'titulo', 'assunto' or 'movimento'
    number_seen = False    # True once 'n' was found in the current title

    for keyword in keywords:
        if keyword == 'PAUTA':
            started = True
        if not started:
            continue

        if keyword == 'ASSUNTO' and section == 'titulo':
            _clean_title_fields(pauta)
            section = 'assunto'
            continue

        if keyword == 'MOVIMENTO' and section == 'assunto':
            section = 'movimento'
            continue

        if keyword in title_words and section != 'assunto':
            if section == 'movimento':
                # The previous item is complete: emit it and start a fresh
                # record so later title tokens cannot alter the emitted one.
                _clean_body_fields(pauta)
                pautas.append(pauta)
                pauta = _new_pauta()
            section = 'titulo'
            number_seen = False
            # No `continue`: the title keyword itself is part of 'tipo'.

        if section == 'assunto':
            pauta['assunto'] += keyword + ' '
        elif section == 'movimento':
            pauta['movimento'] += keyword + ' '
        elif section == 'titulo':
            if re.search(r'\d/20\d\d', keyword):
                pauta['n'] = keyword
                number_seen = True
            elif number_seen:
                pauta['responsavel'] += keyword + ' '
            elif not re.search(r'^N[.]*[\u00ba]+', keyword):
                pauta['tipo'] += keyword + ' '

    # No later title keyword exists to close the last item, so emit it here.
    if section == 'movimento':
        _clean_body_fields(pauta)
        pautas.append(pauta)
    return pautas
# --- Script body: parse one agenda PDF, print it as JSON, plot a word cloud.

# Extract all page text inside the `with` block: the reader pulls data from
# the open stream, and the file is now closed as soon as extraction is done.
with open('documents/Abril/ordem_do_dia_07_05_19.pdf', 'rb') as pdf_file_obj:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    pdf_text = ''.join(
        pdf_reader.getPage(page).extractText()
        for page in range(pdf_reader.numPages))

# Delete the unwanted substrings, tokenize, and drop punctuation tokens.
pdf_text = replace_all(TO_SCAPE, pdf_text)
tokens = word_tokenize(pdf_text)
keywords = [word for word in tokens if word not in PUNCTUATIONS]

documento = {
    'oradores': find_orador(keywords),
    'pautas': find_topics(keywords),
}
print(json.dumps(documento, sort_keys=True, indent=4, ensure_ascii=False))

# Build the word-cloud text from every item's 'assunto'. Words are
# lower-cased BEFORE the stop-word test so capitalised stop words are
# filtered too, and joined with explicit spaces so words from neighbouring
# items can never run together.
stop_words = set(stopwords.words('portuguese'))
cloud_words = []
for pauta in documento['pautas']:
    for word in pauta['assunto'].lower().split():
        if word not in stop_words:
            cloud_words.append(word)
content = ' '.join(cloud_words)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=set(STOPWORDS),
                      min_font_size=10).generate(content)

# Show the cloud on a square figure without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()