generated from amosproj/amos202Xss0Y-projname
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Download Google Docs Content
- Loading branch information
Showing
9 changed files
with
215 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
76 changes: 76 additions & 0 deletions
76
src/backend/RAG/LangChain_Implementation/get_google_docs.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import io | ||
import os | ||
import pickle | ||
import re | ||
from pathlib import Path | ||
|
||
from google.auth.transport.requests import Request | ||
from google_auth_oauthlib.flow import InstalledAppFlow | ||
from googleapiclient.discovery import build | ||
from googleapiclient.http import MediaIoBaseDownload | ||
|
||
def extract_document_id_from_url(url):
    """Return the Google Docs document ID embedded in *url*.

    The ID is read from the ``/d/<id>`` path segment when one is present.
    When the input has no such segment (for example a bare ID was passed),
    the longest run of ID characters is returned instead, or an empty
    string if there is none.

    Args:
        url: A Google Docs URL, or a bare document ID.

    Returns:
        The extracted document ID as a string ('' when nothing matches).
    """
    # Drive file IDs are URL-safe tokens: besides letters and digits they can
    # contain '-' and '_', so both must count as part of the ID or such IDs
    # would be cut off at the first hyphen/underscore.
    id_chars = r'[A-Za-z0-9_-]+'

    # Prefer the explicit path segment over guessing by length, so a long
    # hostname or query-string token can never be mistaken for the ID.
    in_path = re.search(r'/d/(' + id_chars + r')', url)
    if in_path:
        return in_path.group(1)

    # Fallback: "longest token wins" heuristic for inputs without a /d/ segment.
    return max(re.findall(id_chars, url), key=len, default='')
|
||
def authenticate(credentials, scopes, token_path='token.pickle'):
    """Return Google API credentials, running the OAuth flow only when needed.

    Cached credentials are loaded from *token_path* if that file exists.
    Valid cached credentials are returned as-is. Expired credentials that
    carry a refresh token are refreshed. Otherwise the interactive
    installed-app flow is started. Refreshed or newly obtained credentials
    are written back to *token_path* for the next run.

    Args:
        credentials: Path to the OAuth client-secrets JSON file.
        scopes: OAuth scopes to request in the installed-app flow.
        token_path: Location of the pickled token cache. Defaults to
            ``token.pickle`` in the current working directory.

    Returns:
        The credentials object (cached, refreshed, or newly obtained).
    """
    creds = None
    # SECURITY: pickle.load can execute arbitrary code from the file it reads.
    # token_path must only ever point at a file written by this function.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as token:
            creds = pickle.load(token)

    # Fast path: nothing to refresh and nothing to save.
    if creds and creds.valid:
        return creds

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        # No usable cached credentials: let the user log in.
        flow = InstalledAppFlow.from_client_secrets_file(credentials, scopes)
        creds = flow.run_local_server(port=0)

    # Save the credentials for the next run.
    with open(token_path, 'wb') as token:
        pickle.dump(creds, token)

    return creds
|
||
|
||
def download_file(file_id, credentials_path, file_name):
    """Export a Google Docs file as plain text, save it, and return the text.

    Args:
        file_id: ID of the document to export.
        credentials_path: Path to the OAuth client-secrets JSON file, passed
            on to ``authenticate``.
        file_name: Destination path for the exported text. An existing file
            is overwritten.

    Returns:
        The exported document content, read back from *file_name* as UTF-8.
    """
    scopes = ['https://www.googleapis.com/auth/drive.readonly']
    credentials = authenticate(credentials_path, scopes)
    drive_service = build('drive', 'v3', credentials=credentials)

    # Export the Google Docs file as plain text.
    export_mime_type = 'text/plain'
    request = drive_service.files().export_media(fileId=file_id, mimeType=export_mime_type)

    # The with-block closes the write handle before the file is read back,
    # so the handle is not leaked and is released even if a chunk fails.
    with io.FileIO(file_name, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f'Download {int(status.progress() * 100)}%.')

    # Read the content of the exported file.
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.read()

    return content
|
||
|
||
def main():
    """Example usage: download one hard-coded Google Doc and print its text.

    The exported text is written to ``data/google_docs_content.txt``; the
    ``data`` directory is created if it does not exist yet.
    """
    document_id = extract_document_id_from_url("https://docs.google.com/document/d/1xrfrwyRCTrxiCupiKSSFgKUxiCTXgr45gPJYybnY23w/edit")
    credentials_json = 'credentials.json'

    # Define the file path in a cross-platform manner.
    file_name = Path('data') / 'google_docs_content.txt'
    file_name.parent.mkdir(parents=True, exist_ok=True)

    # TODO: make this callable from typescript with url

    content = download_file(document_id, credentials_json, file_name)
    print(content)


# Run the example only when executed as a script, so importing this module
# for its helpers does not start an OAuth flow or a download.
if __name__ == '__main__':
    main()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters