-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_loader.py
80 lines (68 loc) · 2.38 KB
/
file_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import requests
import io
import extract_msg
import numpy as np
import pandas as pd
import pdfplumber
import tabula
import textract
# !pip install utils --user
# !pip install tabula-py
# !pip install --user textract
# Widen pandas' console display limits so large frames print without
# truncated rows/columns or early line wrapping.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
def get_mail(file_path):
    """Open the message file at *file_path* with ``extract_msg``.

    Returns the ``extract_msg.Message`` object. This function does not close
    it; the caller is expected to call ``close()`` on the result.
    """
    return extract_msg.Message(file_path)
def mail2df(msg):
    """Return the body of *msg* as a one-element pandas Series.

    A single-row DataFrame is built from the message's header fields and body,
    *msg* is closed, and only the ``'MailBody Text'`` column is returned.
    """
    # (column header, attribute read from msg), in output column order.
    field_map = (
        ('File Name', 'filename'),
        ('From', 'sender'),
        ('To', 'to'),
        ('Date', 'date'),
        ('Subject', 'subject'),
        ('MailBody Text', 'body'),
        ('Message ID', 'message_id'),
    )
    headers = [header for header, _ in field_map]
    record = [getattr(msg, attribute) for _, attribute in field_map]
    frame = pd.DataFrame([record], columns=headers)
    # The message is closed here, so callers must not read from it afterwards.
    msg.close()
    return frame['MailBody Text']
def _read_pdf(file_name):
    """Return tabula's tables for a PDF, or the PDF's plain text if there are none."""
    tables = tabula.read_pdf(file_name, pages='all')
    # Compare the length explicitly instead of relying on truthiness.
    # NOTE(review): tabula-py presumably returns a list of DataFrames here;
    # if it ever returns a single DataFrame, ``not tables`` would raise.
    if len(tables) != 0:
        return tables
    with pdfplumber.open(file_name) as pdf:
        # Extract each page exactly once; text extraction is the costly step.
        page_texts = [page.extract_text() for page in pdf.pages]
    # Pages that yield no text (non-str results) are skipped.
    return ''.join(text for text in page_texts if isinstance(text, str))


def open_file(file_name, clean_up=True):
    """Load *file_name* according to its extension and optionally delete it.

    The extension is matched case-insensitively:

    * ``msg``          -> result of ``mail2df`` (the mail body as a Series)
    * ``csv``          -> ``pandas.read_csv`` DataFrame
    * ``txt``          -> file contents as one string with newlines removed
    * ``xlsx``/``xls`` -> ``pandas.read_excel`` DataFrame
    * ``docx``/``doc`` -> text decoded from ``textract.process``
    * ``pdf``          -> tables from ``tabula.read_pdf``; if that result is
      empty, the concatenated page text from ``pdfplumber``
    * anything else    -> an empty DataFrame

    Args:
        file_name: Path of the file to load.
        clean_up: When true (the default), the file is DELETED from disk
            after it has been read. Pass ``False`` to keep it.

    Returns:
        The loaded content; its type depends on the extension (see above).
    """
    # splitext inspects only the final path component, so an extension-less
    # name such as "txt" or a dotted directory is not mistaken for a type.
    file_extension = os.path.splitext(file_name)[1].lstrip('.').lower()

    df = pd.DataFrame()
    if file_extension == 'msg':
        msg = get_mail(file_name)
        try:
            df = mail2df(msg)
        finally:
            # Release the handle even if the conversion fails, so the file
            # is not left open (and can still be removed below).
            msg.close()
    elif file_extension == 'csv':
        df = pd.read_csv(file_name)
    elif file_extension == 'txt':
        with open(file_name, 'r') as text_file:
            df = text_file.read().replace('\n', '')
    elif file_extension in ('xlsx', 'xls'):
        df = pd.read_excel(file_name)
    elif file_extension in ('docx', 'doc'):
        df = textract.process(file_name).decode()
    elif file_extension == 'pdf':
        df = _read_pdf(file_name)

    if clean_up and os.path.isfile(file_name):
        os.remove(file_name)
    return df
def get_csv(fname, link, timeout=30):
    """Download the CSV at *link* and write it to *fname* via pandas.

    Args:
        fname: Destination path handed to ``DataFrame.to_csv``.
        link: URL of the CSV resource; the response is decoded as UTF-8.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on an error status instead of parsing (and saving) an
    # HTML error page as if it were CSV data.
    response.raise_for_status()
    csv_buffer = io.StringIO(response.content.decode('utf-8'))
    frame = pd.read_csv(csv_buffer)
    # NOTE(review): to_csv also writes the DataFrame index as an unnamed
    # leading column; kept as-is for compatibility with existing output.
    # Pass index=False here if that extra column is not wanted.
    frame.to_csv(fname)
def download_csv_file(file_name='currency_codes.csv'):
    """Download the currency-codes CSV from datahub.io and save it as *file_name*."""
    source_url = 'https://www.datahub.io/JohnSnowLabs/iso-4217-currency-codes/r/iso-4217-currency-codes-csv.csv'
    get_csv(fname=file_name, link=source_url)
if __name__ == '__main__':
    # Smoke run: download the currency-codes CSV, load it back (which also
    # deletes the downloaded file because clean_up is enabled), and preview it.
    csv_path = 'currency_codes.csv'
    download_csv_file(file_name=csv_path)
    currency_frame = open_file(csv_path, clean_up=True)
    print(currency_frame.head())