Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upload pdf to s3 #38

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .github/workflows/build_pdf_book.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
name: Build latest version of PDF Book

on:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good. Can you please explain exactly when this workflow will get triggered?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the workflow will be triggered whenever new commits are pushed to the examples/tutorials directory of the deepchem repository.

workflow_dispatch:
repository_dispatch:
types: [rebuild-book]

permissions:
contents: write

jobs:
  build_pdf_book:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      # Python packages plus the system tools the build script shells out to
      # (poppler-utils ships pdfunite; wkhtmltopdf is the pdfkit backend).
      - name: install requirements
        run: |
          cd new-website
          cd utils
          pip install -r requirements.txt
          sudo apt-get install -y poppler-utils
          sudo apt-get install -y wkhtmltopdf

      - name: fetch latest version of tutorials
        run: |
          # -y keeps the install non-interactive, matching the other apt-get calls.
          sudo apt-get install -y jq
          cd new-website
          cd utils/tutorials
          python3 fetch_tutorials.py

      # The AWS credentials are exposed only to this step, which runs the
      # script that uploads the finished book to S3.
      - name: build pdf book
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cd new-website
          cd utils/tutorials
          python3 build_pdf_book.py


8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
cd new-website
cd utils
pip install -r requirements.txt
sudo apt-get install -y poppler-utils
sudo apt-get install -y wkhtmltopdf
- name: Test tutorial fetching and export
run: |
Expand All @@ -37,4 +39,10 @@ jobs:
cd utils/tutorials
python3 test_utils.py
- name: Test tutorials build pdf book functions
run: |
cd new-website
cd utils/tutorials
python3 test_build_pdf_book.py
Binary file modified new-website/utils/requirements.txt
Binary file not shown.
130 changes: 112 additions & 18 deletions new-website/utils/tutorials/build_pdf_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- pdfunite
- pdfkit
- mdpdf
- boto3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add boto3 to the requirements file.


Example Usage:
- Run the script "fetch_tutorials.py" // It will fetch all the tutorials.
Expand All @@ -23,42 +24,131 @@
import pdfkit
from utils import numeric_sorter
from typing import List
import signal
import logging
import boto3
from botocore.exceptions import ClientError

INFO_PATH = "website-render-order/"
DATA_PATH = "html-notebooks/"
PDF_PATH = "storage/"

INFO_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/website-render-order/"
DATA_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/html-notebooks/"
PDF_PATH = "/workspaces/deepchem.github.io/new-website/utils/tutorials/storage/"

files = os.listdir(INFO_PATH)
files = sorted(files)
def timeout_handler(signum, frame):
    """
    Signal handler used to abort a call that runs for too long.

    Parameters
    ----------
    signum: int
        Number of the signal being handled (unused).
    frame: frame object
        Stack frame that was interrupted by the signal (unused).

    Raises
    ------
    TimeoutError
        Always. TimeoutError is a subclass of Exception, so callers that
        catch Exception keep working.

    """
    raise TimeoutError("Conversion Timed out")


def html_to_pdf(data_path=DATA_PATH, info_path=INFO_PATH, pdf_path=PDF_PATH):
    """
    Converts HTML files to PDF files.

    Every file listed in the "File Name" column of each render-order CSV is
    converted with pdfkit. Each conversion is limited to 60 seconds by a
    SIGALRM timer. A failed or timed-out conversion is reported on stdout
    and skipped, so one bad notebook does not abort the whole run.

    Parameters
    ----------
    data_path: str
        Path of the html files to be converted. Defaults to DATA_PATH.
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the converted pdf files will be stored. Defaults to PDF_PATH.

    Notes
    -----
    Errors raised during a single conversion (including the timeout) are
    caught and printed, not propagated. Errors from listing `info_path` or
    reading a CSV are not caught.

    """
    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    # Install the handler once and remember the previous one so it can be
    # restored when this function is done.
    previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
    try:
        for i in files_list:
            chapter = pd.read_csv(info_path + "-".join(i))
            for j in chapter["File Name"]:
                signal.alarm(60)
                try:
                    print(i, j)
                    # j[:-5] drops the last five characters of the listed name
                    # (presumably "ipynb" -- TODO confirm) and keeps the dot.
                    pdfkit.from_file(data_path + j[:-5] + "html",
                                     pdf_path + j[:-5] + "pdf")
                    print("Conversion Successful")
                except Exception as e:
                    print("Exception occured: {}".format(e))
                finally:
                    # Cancel the timer so it cannot fire after this conversion.
                    signal.alarm(0)
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; None cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def upload_file(file_name, bucket, object_name=None):
    """
    Upload a file to an S3 bucket

    Parameters
    ----------
    file_name: str
        Path of the File to be uploaded.
    bucket: str
        Name of the Bucket to upload the file to.
    object_name: str
        S3 object name. If not specified then the base name of file_name
        is used.

    Returns
    -------
    boolean:
        True if file was uploaded, else False

    """
    # Imported locally so the block does not depend on a new top-level import.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use the base name of file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        # The managed transfer reports a failed upload as S3UploadFailedError,
        # which is not a ClientError, so both must be caught for the
        # documented False return to be reachable.
        logging.error(e)
        return False
    return True


def merge_pdf(info_path=INFO_PATH, pdf_path=PDF_PATH):
    """
    Merges the compiled PDFs.

    The chapter PDFs are collected in render order and joined with the
    `pdfunite` command line tool into `<pdf_path>merged.pdf`. Files listed
    in the render-order CSVs that have no PDF on disk are skipped.

    Parameters
    ----------
    info_path: str
        Path for Tutorial Render Order. Defaults to INFO_PATH.
    pdf_path: str
        Path where the merged pdf file will be stored. Defaults to PDF_PATH.

    """
    # Imported locally so the block does not depend on a new top-level import.
    import subprocess

    files_list = numeric_sorter(sorted(os.listdir(info_path)))

    inputs = []
    for i in files_list:
        print(i)
        chapter = pd.read_csv(info_path + "-".join(i))
        for j in chapter["File Name"]:
            file_path = pdf_path + j[:-5] + "pdf"
            if os.path.exists(file_path):
                print(i, j)
                inputs.append(file_path)

    if not inputs:
        # pdfunite needs at least one source file before the destination.
        print("No PDF files found to merge")
        return

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters from being split or interpreted.
    try:
        subprocess.run(["pdfunite", *inputs, pdf_path + "merged.pdf"],
                       check=False)
    except OSError as e:
        # os.system never raised when the tool was missing; stay non-fatal.
        print("Could not run pdfunite: {}".format(e))


def merge_pdf_pages(a: List[str]):
"""Merges the PDFs.
Expand All @@ -73,7 +163,8 @@ def merge_pdf_pages(a: List[str]):
command = "pdfunite "
for i in a:
command = command + i + ' '
os.system(command + "storage/merged.pdf")
os.system(command + "storage/full_pdf.pdf")


def compile_information_pages():
"""Converts the Acknowledgent page and content page from
Expand All @@ -85,9 +176,12 @@ def compile_information_pages():
pdfkit.from_file('contents.html', 'storage/contents.pdf')
pdfkit.from_file('acknowledgement.html', 'storage/acknowledgement.pdf')


if __name__ == "__main__":
    # makedirs(exist_ok=True) works on every platform and is a no-op when the
    # directory already exists, unlike shelling out to `mkdir`.
    os.makedirs(PDF_PATH, exist_ok=True)
    html_to_pdf()
    merge_pdf()
    compile_information_pages()
    merge_pdf_pages(['storage/title.pdf', 'storage/acknowledgement.pdf', 'storage/contents.pdf', 'storage/merged.pdf'])
    # upload_file reports failure through its return value; turn that into a
    # non-zero exit status so the calling workflow does not pass silently.
    if not upload_file('storage/full_pdf.pdf', 'deepchemtutorials', 'TutorialsBook.pdf'):
        raise SystemExit("Upload of storage/full_pdf.pdf to S3 failed")

Loading
Loading