-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathex1_pdfminer.py
42 lines (35 loc) · 1.17 KB
/
ex1_pdfminer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 15 17:55:41 2016
@author: Claudio

Open one IPDO PDF report with pdfminer and collect its page objects.

The input file is looked up at
``<cwd>/Scripts-py/<mes>-<ano>/IPDO-<dia>-<mes>-<ano>-unlocked.pdf``.
After a successful run the names ``fp``, ``parser``, ``document``,
``rsrcmgr``, ``device``, ``interpreter`` and ``pages`` are defined at
module level.

Raises:
    PDFTextExtractionNotAllowed: if the document reports that it is not
        extractable.
"""
import os

from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser

# Day, month and year of the report to load. They are kept as strings
# because they are spliced verbatim into the directory and file names.
dia = "01"
mes = "08"
ano = "2016"

# Build the path with os.path.join instead of hand-written backslashes:
# literals such as "\S" and "\I" are invalid escape sequences, and a
# hard-coded "\" separator only works on Windows.
caminho = os.path.join(os.getcwd(), "Scripts-py", mes + "-" + ano)
nome_arquivo_entrada = os.path.join(
    caminho, "IPDO-{}-{}-{}-unlocked.pdf".format(dia, mes, ano)
)

# Open the PDF as a binary stream.
# NOTE(review): the handle is intentionally left open on success; pdfminer
# is documented to parse objects lazily from the stream, so `document` and
# `pages` presumably still need it afterwards -- confirm before closing it.
fp = open(nome_arquivo_entrada, 'rb')
try:
    # Fetch PDF objects from the file stream.
    parser = PDFParser(fp)
    # Store the document's structure (empty password).
    document = PDFDocument(parser, password="")
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Collect every page contained in the document.
    pages = list(PDFPage.create_pages(document))
except Exception:
    # Do not leak the file handle when setup fails part-way through.
    fp.close()
    raise