-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataSync_Gsheet_DB.py
387 lines (278 loc) · 14.2 KB
/
DataSync_Gsheet_DB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
import gspread
from datetime import datetime
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
import pytz
import mysql.connector
import json
class RealTimeSync:
def __init__(self):
self.table_name = 'sheetData'
# Path to your credentials JSON file obtained from google sheet
self.CREDENTIALS_FILE = 'path to your credentials JSON file '
# Define the scope and authenticate with the credentials file
self.SCOPES = ["https://www.googleapis.com/auth/spreadsheets","https://www.googleapis.com/auth/drive"]
self.creds = Credentials.from_service_account_file(self.CREDENTIALS_FILE, scopes=self.SCOPES)
self.client = client = gspread.authorize(self.creds)
# Open the Google Sheet using the sheet ID from the URL
self.spreadsheet_id = 'your_spreadsheet_id'
self.sheet = client.open_by_key(self.spreadsheet_id).sheet1 # Open the first sheet
# Build the Google Drive API service
self.drive_service = build('drive', 'v3', credentials=self.creds)
# Database connection
#Changes the config as mentioned
self.db = mysql.connector.connect(
host="localhost",
user="root",
password="your_password",
database="your_databaseName"
)
self.cursor = self.db.cursor()
# Function to generate the SQL for adding new columns
def add_column_sql(self, column_name):
return f"ALTER TABLE {self.table_name} ADD `{column_name}` VARCHAR(255);"
# Function to generate the SQL for dropping columns
def drop_column_sql(self, column_name):
return f"ALTER TABLE {self.table_name} DROP COLUMN `{column_name}`;"
# Function to generate the SQL for renaming columns
def rename_column_sql(self, old_column, new_column):
return f"ALTER TABLE {self.table_name} CHANGE `{old_column}` `{new_column}` VARCHAR(255);"
def sanitize_rows(self, sheetRows, empty_string_indices):
"""
Removes elements from each row based on indices where the column is an empty string.
:param sheet_rows: List of rows, where each row is a list of column values
:param empty_string_indices: List of indices where the column values are empty strings
:return: Updated list of rows with empty string columns removed
"""
sanitized_rows = []
for row in sheetRows:
# Remove elements at indices specified in empty_string_indices
sanitized_row = [value for index, value in enumerate(row) if index not in empty_string_indices]
if sanitized_row != []:
sanitized_rows.append(sanitized_row)
return sanitized_rows
def generate_update_sql(self, unique_id, changes, id_column='id'):
"""
Generates an SQL UPDATE query to update specific columns in the database based on changes.
:return: The generated SQL UPDATE query as a string.
"""
# Create a list of 'column = value' strings
set_clause = ", ".join([f"`{column}` = %s" for column in changes])
# Construct the final SQL query
sql_query = f"UPDATE {self.table_name} SET {set_clause} WHERE `{id_column}` = %s"
# Return both the query and the values (for parameterized query)
return sql_query, list(changes.values()) + [unique_id]
def updateValue_db(self, sheetRows, sheetColumns):
self.cursor.execute(f"SELECT * FROM {self.table_name}")
db_data = self.cursor.fetchall()
for i, row in enumerate(sheetRows):
changes = {}
unique_id = i + 1
for j, ele in enumerate(row):
if ele == db_data[i][j + 1]:
continue
else:
changes[sheetColumns[j]] = ele
if changes == {}:
continue
sql_query, params = self.generate_update_sql(unique_id, changes)
self.cursor.execute(sql_query, params)
self.db.commit()
def fetch_timestamp_sheet(self):
file_metadata = self.drive_service.files().get(fileId=self.spreadsheet_id, fields='modifiedTime').execute()
# Print the last modified time
last_modified_time = file_metadata['modifiedTime']
print(f"The Google Sheet was last modified on: {last_modified_time}")
# Convert the ISO time string to a datetime object (assumes UTC timezone because of the 'Z')
utc_time = datetime.strptime(last_modified_time, '%Y-%m-%dT%H:%M:%S.%fZ')
# Define UTC and IST timezones using pytz
utc = pytz.timezone('UTC')
ist = pytz.timezone('Asia/Kolkata')
# Localize the UTC time and convert to IST
utc_time = utc.localize(utc_time)
ist_time = utc_time.astimezone(ist)
# Output the IST time in a readable format
return ist_time.strftime('%Y-%m-%d %H:%M:%S %Z')
def fetch_max_time_stamp(self):
# Find the max timestamp in the database
query = f"""
SELECT MAX(GREATEST(created_at, updated_at)) AS overall_latest_timestamp
FROM {self.table_name};
"""
# Execute the SQL query
self.cursor.execute(query)
# Fetch the result
result = self.cursor.fetchone()
# Extract the timestamp
if result[0] is None:
return None # Handle cases where there are no timestamps
# Assume the timestamp is already in IST, so no timezone conversion is needed
timestamp = result[0]
# Format the timestamp in the same way as fetch_timestamp_sheet
# If it's a datetime object, we format it; if it's a string, we parse it first
if isinstance(timestamp, datetime):
return timestamp.strftime('%Y-%m-%d %H:%M:%S %Z')
else:
# Convert string to datetime if necessary
timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
return timestamp.strftime('%Y-%m-%d %H:%M:%S %Z')
def updateDb(self,sheetRows,sheetColumns):
# print(sheetColumn)
query = f"DESCRIBE {self.table_name}"
self.cursor.execute(query)
# Format: [('Col name1','datatype',...),(...),(...)]
columnsDb = self.cursor.fetchall()
db_column_names = [col[0] for col in columnsDb]
# Filter out 'id', 'created_at', and 'updated_at' from db_column_names
column_names = [col for col in db_column_names if col not in ['id', 'created_at', 'updated_at']]
# Add columns to db
add_columns = []
remove_columns = []
if (len(sheetColumns) > len(column_names)):
start_index = len(column_names)
add_columns = sheetColumns[start_index:]
else:
start_index = len(sheetColumns)
remove_columns = column_names[start_index:]
# Detect renamed columns
renamed_columns = {}
for sheet_col, db_col in zip(sheetColumns, column_names):
if sheet_col != db_col:
renamed_columns[db_col] = sheet_col
print(add_columns, remove_columns, renamed_columns)
alter_queries = []
for col in add_columns:
alter_queries.append(self.add_column_sql(col))
for col in remove_columns:
alter_queries.append(self.drop_column_sql(col))
# Iterate over the renamed_columns dictionary
for old_col, new_col in renamed_columns.items():
# Add the query to the list
alter_queries.append(self.rename_column_sql(old_col, new_col))
# Execute each SQL query to alter the table
for query in alter_queries:
print(f"Executing query: {query}") # Print the query for debugging purposes
self.cursor.execute(query)
self.db.commit()
query = f"SELECT COUNT(*) FROM {self.table_name};"
# Execute the query
self.cursor.execute(query)
row_count = self.cursor.fetchone()[0]
print(row_count)
print(len(sheetRows))
if row_count > len(sheetRows):
self.updateValue_db(sheetRows, sheetColumns)
query1 = f"SELECT id FROM (SELECT id FROM {self.table_name} ORDER BY id LIMIT {len(sheetRows)}) AS temp_table"
self.cursor.execute(f"DELETE FROM {self.table_name} WHERE id NOT IN ({query1}) ")
self.db.commit()
else:
self.updateValue_db(sheetRows, sheetColumns)
self.cursor.close()
print("data update in Db successfully")
def updateSheet(self):
# The range in the sheet where we want to update the values
cell_range = 'H2:H10' # This will update from row 2 to row 10 in column H
# Define the new values to be inserted (as a list of lists)
new_values = [[i] for i in range(4, 13)] # Values from 2 to 10
# Update the cells with the new values using named arguments
self.sheet.update(range_name=cell_range, values=new_values)
print("Column H updated successfully!")
def updateSheetData(self):
#Clean the sheet
self.sheet.clear()
# Fetch the list of columns from the table
self.cursor.execute("SHOW COLUMNS FROM sheetData")
columns = self.cursor.fetchall()
excluded_columns = {'id', 'created_at', 'updated_at'}
# Extract column names excluding specified columns
column_names = [col[0] for col in columns if col[0] not in excluded_columns]
# Prepare the data to be updated in the first row
data_to_insert = [column_names]
# Fetch the list of columns from the table
self.cursor.execute("SHOW COLUMNS FROM sheetData")
columns = self.cursor.fetchall()
# Update the Google Sheet starting from cell A1
self.sheet.update('A1', data_to_insert)
# Construct the SQL query to select all columns except the excluded ones
columns_to_select = ', '.join(f"`{col}`" for col in column_names)
query = f"SELECT {columns_to_select} FROM sheetData"
# Execute the query
self.cursor.execute(query)
# Fetch all results
results = self.cursor.fetchall()
# Initialize a list to store column values
column_values = [[] for _ in column_names]
# Organize results into lists for each column
for row in results:
for i, value in enumerate(row):
column_values[i].append(value)
# Transpose the data: convert columns to rows
transposed_values = list(map(list, zip(*column_values)))
# Determine the starting row
starting_row = 2
# Update the Google Sheet starting from the specified row
cell_range = f'A{starting_row}:{chr(64 + len(transposed_values[0]))}{starting_row + len(transposed_values) - 1}'
self.sheet.update(cell_range, transposed_values)
def main(self):
# Fetch all data from the sheet
data = self.sheet.get_all_values()
# Extract column names and rows in Sheet
sheetColumns = data[0] # First row contains the column names
sheetRows = data[1:] # Subsequent rows contain the data
check_table_query = f"""
show tables like '{self.table_name}'
"""
self.cursor.execute(check_table_query)
# None when empty
table_exists = self.cursor.fetchone()
empty_string_indices = [index for index, column in enumerate(sheetColumns) if column == '']
sheetColumns = [col.strip() for col in sheetColumns if col.strip()]
sheetRows = self.sanitize_rows(sheetRows, empty_string_indices)
#If Removed all the rows in sheet remove the table
if len(sheetRows) == 0:
self.cursor.execute(f"DROP TABLE IF EXISTS {self.table_name}")
# Check if the table does not exists create a new table
elif table_exists is None:
# Drop the table if it already exists (optional)
self.cursor.execute(f"DROP TABLE IF EXISTS {self.table_name}")
# Dynamically create the table based on the column names
# Assuming all columns are VARCHAR(255)
column_definitions = "`id` INT NOT NULL AUTO_INCREMENT PRIMARY KEY, " + ", ".join(
[f"`{col}` VARCHAR(255) NOT NULL DEFAULT '' " for col in
sheetColumns]) + ", `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, " \
"`updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP"
# Execute the create table query
create_table_query = f"CREATE TABLE {self.table_name} ({column_definitions})"
self.cursor.execute(create_table_query)
print(f"Table '{self.table_name}' created successfully.")
# Insert data into the newly created table
for row in sheetRows:
placeholders = ", ".join(["%s"] * len(sheetColumns)) # Creates '%s, %s, %s,...'
column_names = ", ".join(
[f"`{col}`" for col in sheetColumns]) # Wrap column names in backticks for safety
sql = f"INSERT INTO {self.table_name} ({column_names}) VALUES ({placeholders})"
# Execute the query with row values
self.cursor.execute(sql, row)
# Commit the transaction
self.db.commit()
# Close the cursor and connection
self.cursor.close()
self.db.close()
print("Data inserted successfully!")
else:
max_timestamp=self.fetch_max_time_stamp()
sheet_timeStamp=self.fetch_timestamp_sheet()
print(sheet_timeStamp,max_timestamp)
# Compare both timestamps
#if timestamp of sheet is latest update in db
if sheet_timeStamp > max_timestamp:
self.updateDb(sheetRows,sheetColumns)
#otherwise update in sheet
else:
self.updateSheetData()
self.db.close()
if __name__ == '__main__':
obj = RealTimeSync()
obj.main()
# obj.updateSheet()
# print(obj.fetch_timestamp_sheet())