-
Notifications
You must be signed in to change notification settings - Fork 1
/
setup_database.py
111 lines (95 loc) · 4.25 KB
/
setup_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from pymongo import MongoClient
import pandas as pd
import os
import warnings
# Suppress specific openpyxl warnings
warnings.filterwarnings("ignore", category=UserWarning, module='openpyxl')
# Connect to MongoDB
client = MongoClient('localhost', 27017)
db = client['sikorsky_archives']
def normalize_date(date_str):
"""Normalize date to month/day/year format."""
try:
date_obj = pd.to_datetime(date_str, errors='coerce')
if pd.isna(date_obj):
return None
return date_obj.strftime('%m/%d/%Y')
except Exception as e:
print(f"Error parsing date: {date_str} - {e}")
return None
def process_aircraft_model(model_str):
"""Split aircraft model string into a list if it contains multiple values."""
if isinstance(model_str, str):
return [model.strip() for model in model_str.split(',')]
return [model_str]
def preprocess_comments(comments):
"""Preprocess comments if needed."""
if isinstance(comments, str):
return comments.strip()
return comments
def handle_nan_values(row):
"""Replace NaN values with None."""
return {key: (value if pd.notna(value) else None) for key, value in row.items()}
def validate_data(row):
"""Validate and clean the data row by row."""
row = handle_nan_values(row)
# Validate Date format
row['Date'] = normalize_date(row.get('Date'))
# Normalize Aircraft_Model
row['Aircraft_Model'] = process_aircraft_model(row.get('Aircraft_Model', None))
# Preprocess comments
row['Comment'] = preprocess_comments(row.get('Comment', ''))
return row
def process_excel_file(file_path, collection_name):
try:
df = pd.read_excel(file_path)
# Prepare documents for MongoDB
documents = []
for _, row in df.iterrows():
cleaned_row = validate_data(row)
document = {
"searchTags": [cleaned_row.get('Search_1', ''), cleaned_row.get('Search_2', '')],
"itemNumber": cleaned_row.get('Item_Number', ''),
"indexes": {
"index1": cleaned_row.get('Index_1', ''),
"index2": cleaned_row.get('Index_2', 0)
},
"type": cleaned_row.get('Type', 'Unknown'),
"reference": cleaned_row.get('Reference', 'N/A'),
"fileLink": cleaned_row.get('See_It', 'No link provided'),
"aircraftModel": cleaned_row.get('Aircraft_Model', 'Unknown'),
"description": cleaned_row.get('Description', 'No description provided'),
"names": cleaned_row.get('Names', 'Anonymous'),
"date": cleaned_row.get('Date'),
"comments": cleaned_row.get('Comment', '')
}
documents.append(document)
# Insert documents into MongoDB
if documents:
db[collection_name].insert_many(documents)
print(f"Inserted data from {file_path} into {collection_name} collection")
else:
print(f"No valid data found in {file_path}")
except Exception as e:
print(f"Failed to process {file_path}: {e}")
def main(root_directory):
try:
for root, dirs, files in os.walk(root_directory):
# Process only the main folders (i.e., folders in root_directory)
if root == root_directory:
for dir_name in dirs:
dir_path = os.path.join(root_directory, dir_name)
print(f"Processing directory: {dir_path}")
# Only process files directly in this directory
for file in os.listdir(dir_path):
file_path = os.path.join(dir_path, file)
if os.path.isfile(file_path) and (file.endswith('.xls') or file.endswith('.xlsx')):
print(f"Processing file: {file_path}")
process_excel_file(file_path, dir_name)
print(f"Finished processing directory: {dir_path}")
# Stop after processing the first level of directories
break
except Exception as e:
print(f"Failed to process directory: {e}")
if __name__ == "__main__":
main(r'E:\\Hank Sample File - Copy\DATABASE PROJECT\DOWNLOAD FILES') # Use raw string for the path