Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upload pdf to s3 #38

Closed
wants to merge 9 commits into from
129 changes: 111 additions & 18 deletions new-website/utils/tutorials/build_pdf_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- pdfunite
- pdfkit
- mdpdf
- boto3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add boto3 to the requirements file.


Example Usage:
- Run the script "fetch_tutorials.py" // It will fetch all the tutorials.
Expand All @@ -23,42 +24,131 @@
import pdfkit
from utils import numeric_sorter
from typing import List
import signal
import logging
import boto3
from botocore.exceptions import ClientError

# Resolve the data directories relative to this script instead of one specific
# checkout location, so the script works on any machine and from any working
# directory. Each path keeps a trailing separator because the rest of the file
# builds file paths by plain string concatenation (e.g. PDF_PATH + name).
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))

INFO_PATH = os.path.join(_BASE_DIR, "website-render-order", "")
DATA_PATH = os.path.join(_BASE_DIR, "html-notebooks", "")
PDF_PATH = os.path.join(_BASE_DIR, "storage", "")

files = os.listdir(INFO_PATH)
files = sorted(files)
def timeout_handler(signum, frame):
    """
    Signal handler that aborts the operation currently in progress.

    It is meant to be registered with ``signal.signal`` (this file registers
    it for ``signal.SIGALRM``) so that a pending alarm interrupts a call that
    runs for too long.

    Parameters
    ----------
    signum: int
        Number of the signal that was delivered. Unused.
    frame: frame object
        Stack frame that was executing when the signal arrived. Unused.

    Raises
    ------
    TimeoutError
        Always. ``TimeoutError`` is a subclass of ``Exception``, so callers
        that catch ``Exception`` keep working.

    """
    raise TimeoutError("Coversion Timed out")


def html_to_pdf(data_path=DATA_PATH, info_path=INFO_PATH, pdf_path=PDF_PATH):
    """
    Converts HTML files to PDF files.

    Every CSV file found in ``info_path`` is read, and each entry of its
    "File Name" column is converted from ``data_path`` into ``pdf_path``.
    A conversion that fails or takes longer than 60 seconds is reported on
    stdout and skipped; the remaining files are still processed.

    Parameters
    ----------
    data_path: str
        Path of the html files to be converted. Defaults to DATA_PATH.
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the converted pdf files will be stored. Defaults to PDF_PATH.

    Notes
    -----
    Errors raised while converting a single file (including the timeout
    raised by ``timeout_handler``) are caught and printed, not propagated.
    Errors from listing ``info_path`` or reading a CSV are not caught.
    The timeout relies on ``signal.SIGALRM``, which is only available on Unix.

    """
    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    # The handler does not change between files, so install it once.
    signal.signal(signal.SIGALRM, timeout_handler)

    for i in files_list:
        chapter = pd.read_csv(info_path + "-".join(i))
        for j in chapter["File Name"]:
            print(i, j)
            # j[:-5] drops the last five characters of the listed name
            # (presumably the "ipynb" extension -- confirm against the CSVs),
            # keeping the trailing dot so "html"/"pdf" can be appended.
            source = data_path + j[:-5] + "html"
            target = pdf_path + j[:-5] + "pdf"
            signal.alarm(60)
            try:
                pdfkit.from_file(source, target)
                print("Conversion Successful")
            except Exception as e:
                print("Exception occured: {}".format(e))
            finally:
                # Cancel the pending alarm; otherwise it would fire after a
                # fast conversion, outside this try block, and abort the
                # script with an unhandled exception.
                signal.alarm(0)


def upload_file(file_name, bucket, object_name=None):
    """
    Upload a file to an S3 bucket

    Parameters
    ----------
    file_name: str
        Path of the File to be uploaded.
    bucket: str
        Name of the Bucket to upload the file to.
    object_name: str
        S3 object name. If not specified then the base name of file_name
        (the path without its directories) is used.

    Returns
    -------
    boolean:
        True if file was uploaded, else False. A failure is logged with
        ``logging.error`` before False is returned.

    """

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # The managed transfer re-raises client errors as S3UploadFailedError,
        # which is not a ClientError, so both must be caught for a failed
        # upload to be reported as False instead of raising.
        logging.error(e)
        return False
    return True


def merge_pdf(info_path=INFO_PATH, pdf_path=PDF_PATH):
    """
    Merges the compiled PDFs.

    The PDFs are collected in the order given by the CSV files in
    ``info_path`` and joined with the external ``pdfunite`` tool into
    ``merged.pdf`` inside ``pdf_path``. Entries whose PDF does not exist in
    ``pdf_path`` are left out of the merge.

    Parameters
    ----------
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the merged pdf file will be stored. Defaults to PDF_PATH.

    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import subprocess

    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    pdf_files = []
    for i in files_list:
        print(i)
        chapter = pd.read_csv(info_path + "-".join(i))
        for j in chapter["File Name"]:
            # j[:-5] drops the last five characters of the listed name,
            # keeping the trailing dot so "pdf" can be appended.
            file_path = pdf_path + j[:-5] + "pdf"
            if os.path.exists(file_path):
                print(i, j)
                pdf_files.append(file_path)

    if not pdf_files:
        # pdfunite needs at least one input file; nothing to merge.
        print("No PDF files found to merge")
        return

    # An argument list (no shell) keeps file names that contain spaces or
    # shell metacharacters intact.
    try:
        subprocess.run(["pdfunite", *pdf_files, f"{pdf_path}merged.pdf"])
    except OSError as e:
        # Keep the best-effort behaviour of the previous os.system call when
        # pdfunite cannot be started (e.g. it is not installed).
        print("Could not run pdfunite: {}".format(e))


def merge_pdf_pages(a: List[str]):
"""Merges the PDFs.
Expand All @@ -73,7 +163,8 @@ def merge_pdf_pages(a: List[str]):
command = "pdfunite "
for i in a:
command = command + i + ' '
os.system(command + "storage/merged.pdf")
os.system(command + "storage/full_pdf.pdf")


def compile_information_pages():
"""Converts the Acknowledgent page and content page from
Expand All @@ -85,9 +176,11 @@ def compile_information_pages():
pdfkit.from_file('contents.html', 'storage/contents.pdf')
pdfkit.from_file('acknowledgement.html', 'storage/acknowledgement.pdf')


if __name__ == "__main__":
    # makedirs also creates missing parent directories and, with
    # exist_ok=True, does not fail when the directory is already there
    # (e.g. when the script is run a second time).
    os.makedirs(PDF_PATH, exist_ok=True)
    html_to_pdf()
    merge_pdf()
    compile_information_pages()
    # Prepend the title, acknowledgement and contents pages to the merged
    # tutorials, then publish the resulting book.
    merge_pdf_pages(['storage/title.pdf', 'storage/acknowledgement.pdf', 'storage/contents.pdf', 'storage/merged.pdf'])
    upload_file('storage/full_pdf.pdf', 'deepchemtutorials', 'TutorialsBook.pdf')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep an empty line at the end of the file.

Loading