-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
executable file
·107 lines (93 loc) · 3.1 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
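#!/usr/bin/env python3
# Reference: the GitHub REST API endpoints this script uses, shown as curl commands.
#
# Fetch a file from a repository:
#   curl \
#     -H "Accept: application/vnd.github.v3+json" \
#     https://api.github.com/repos/octocat/hello-world/contents/PATH
#
# List a user's repositories:
#   curl \
#     -H "Accept: application/vnd.github.v3+json" \
#     https://api.github.com/users/alanpq/repos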
import sys
import requests
import json
import base64
import markdown as md
import os
import re
# HEAD request can be iffy, but good enough for our purposes
def is_url_image(image_url):
    """Return True if a HEAD request to *image_url* reports an image MIME type.

    Only the ``Content-Type`` response header is inspected; the body is never
    downloaded. Accepted types are PNG, JPEG ("image/jpeg" and the
    non-standard "image/jpg") and GIF.

    Args:
        image_url: Absolute http(s) URL to probe.

    Returns:
        True when the reported media type is one of the accepted image types;
        False otherwise, including when the request fails or the header is
        missing.
    """
    image_formats = ("image/png", "image/jpeg", "image/jpg", "image/gif")
    try:
        # requests.head() does not follow redirects unless asked to, and has
        # no timeout by default; a redirected image would otherwise be judged
        # by the redirect response, and a dead host would hang the scan.
        r = requests.head(image_url, allow_redirects=True, timeout=10)
    except requests.RequestException:
        # An unreachable URL is simply "not an image" for the caller.
        return False
    # The header may be absent, differently cased, or carry parameters
    # (e.g. "image/png; charset=binary"); compare only the bare media type.
    content_type = r.headers.get("content-type", "")
    mime_type = content_type.split(";", 1)[0].strip().lower()
    return mime_type in image_formats
# Matches Markdown image syntax: ![alt](http(s)://host/path "optional title")
#   group 1 -> the full image URL
#   group 2 -> the host name only (used for the deny-list lookup)
# Every part is bounded by a negated character class so that a single match
# cannot run across a second image, a wrapping link target such as
# [![badge](img-url)](link-url), or a later ")" on the same line.
# Trade-off: a URL that itself contains ")" is cut at that character.
imageSearch = re.compile(
    r"!\[[^\]]*\]\(\s*(https?://([^/\s)?#:]+)[^\s)]*)[^)]*\)",
    flags=re.IGNORECASE,
)

# Hosts whose images are never accepted as thumbnails (e.g. status badges).
denylist = {
    "img.shields.io",
}
# --- Validate every input up front, before any network traffic --------------
# All problems are reported in one run so they can be fixed together.
quitTime = False
for required_var in ("USERNAME", "TOKEN"):
    if required_var not in os.environ:
        print("'" + required_var + "' env var not set!")
        quitTime = True
if len(sys.argv) < 2:
    # sys.argv[1] is the directory the per-repository .yml files go into.
    print("No target directory specified!")
    quitTime = True
if quitTime:
    sys.exit(1)

# HTTP Basic credentials: base64("<username>:<token>").
data = os.environ["USERNAME"] + ":" + os.environ["TOKEN"]
headers = {
    'Authorization': 'Basic ' + str(base64.b64encode(data.encode("utf-8")), "utf-8")
}

# NOTE(review): only a single page of results is requested here; if the API
# paginates this listing, accounts with many repositories are truncated.
r = requests.get(
    'https://api.github.com/users/' + os.environ["USERNAME"] + '/repos',
    headers=headers,
    timeout=30,
)
repos = r.json()

# A successful listing is a JSON array. Anything else (typically an object
# with a "message" key describing the error) means the request failed.
if not isinstance(repos, list):
    message = repos.get("message", repos) if isinstance(repos, dict) else repos
    print("ERROR:", message)
    sys.exit(1)
def _yaml_quote(text):
    """Return *text* as a double-quoted scalar with quotes/backslashes escaped.

    A JSON string literal is used because it is also a valid YAML
    double-quoted scalar.
    """
    return json.dumps(text, ensure_ascii=False)


# For every repository: fetch README.md, pick a thumbnail from the images it
# embeds, derive a title and an HTML description, and write <name>.yml into
# the target directory given as sys.argv[1].
for repo in repos:
    print('=======[' + repo["full_name"] + ']=======')
    readme = requests.get(
        'https://api.github.com/repos/' + repo["full_name"] + '/contents/README.md',
        headers=headers,
        timeout=30,
    ).json()
    if "message" in readme:
        # Error replies carry a "message" key; skip this repository.
        print("ERROR:", readme)
        print()
        continue

    # TODO: implement this
    # readme = requests.get('https://api.github.com/repos/' + repo["full_name"] +'/contents/thumbnail.png', headers=headers).json()
    # if "message" in readme:
    #     print(repo["full_name"])
    #     print("ERROR:", readme)
    #     continue

    # b64decode() discards the line breaks embedded in the payload by itself.
    # Undecodable bytes are replaced rather than aborting the whole run.
    markdown = base64.b64decode(readme["content"]).decode("utf-8", errors="replace")

    thumbnail = ""
    print("[THUMBNAIL SCAN]")
    for image_url, host in re.findall(imageSearch, markdown):
        print('"' + image_url + '"', "is ", end="")
        # The pattern matches case-insensitively; deny-list entries are lowercase.
        if host.lower() in denylist:
            print("in deny list.")
            continue
        if not is_url_image(image_url):
            print('not an image.')
            continue
        print("a valid thumbnail")
        # Later hits overwrite earlier ones: the last valid image wins.
        thumbnail = image_url
    print("[END OF THUMBNAIL SCAN]")

    # Use a leading Markdown heading as the title (and drop it from the
    # description); otherwise fall back to the repository name.
    splits = markdown.split('\n')
    firstLine = splits[0].strip()
    title = repo["name"]
    if firstLine.startswith("#"):  # safe on an empty first line, unlike [0]
        # Strip every heading marker so "## Title" does not become "# Title".
        title = firstLine.lstrip("#").strip() or repo["name"]
        splits = splits[1:]

    # Rendered HTML collapsed onto one line so it fits a single YAML scalar.
    desc = md.markdown("\n".join(splits)).replace("\n", "")

    # Free-form text is quoted so ':' / '#' / '"' / '\' cannot break the YAML.
    document = "\n".join([
        "id: " + repo["name"],
        "title: " + _yaml_quote(title),
        "url: " + repo["html_url"],
        "description: " + _yaml_quote(desc),
        "thumbnail: " + _yaml_quote(thumbnail),
    ]) + "\n"

    # The file is opened only once everything is computed, so a failure above
    # never leaves a truncated file behind.
    path = os.path.join(sys.argv[1], repo["name"] + '.yml')
    with open(path, 'w', encoding="utf-8") as file:
        file.write(document)

    print('\nwritten to "' + path + '".')
    print()

sys.exit(0)