main.py
import datetime
import json
import logging
import os
import time
import uuid
from pathlib import Path

import feedparser
import requests


def construct_request_payload(
    article_url: str,
    labels: list[str] | None = None,
    source: str = "api",
    folder: str = "following",
) -> dict:
    """Build the GraphQL SaveUrl mutation payload for a single article URL."""
    # Avoid a mutable default argument; also accept a single label.
    if labels is None:
        labels = []
    elif not isinstance(labels, list):
        labels = [labels]
    # Make sure 'RSS' exists and there are no duplicates.
    labels = [{"name": x} for x in set(labels + ["RSS"])]
    return {
        "query": "mutation SaveUrl($input: SaveUrlInput!) {saveUrl(input: $input) {... on SaveSuccess {url clientRequestId} ... on SaveError {errorCodes message}}}",
        "variables": {
            "input": {
                "clientRequestId": str(uuid.uuid4()),
                "source": source,
                "url": article_url,
                "labels": labels,
                "folder": folder,
            }
        },
    }
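
# For illustration, a sketch of the payload the function above produces for a
# made-up URL and label (the clientRequestId is a fresh UUID on every call,
# and label order may vary because a set is used to deduplicate):
#
#   construct_request_payload("https://example.com/post", labels=["Some Blog"])
#   -> {
#        "query": "mutation SaveUrl($input: SaveUrlInput!) {...}",
#        "variables": {
#          "input": {
#            "clientRequestId": "<uuid4>",
#            "source": "api",
#            "url": "https://example.com/post",
#            "labels": [{"name": "Some Blog"}, {"name": "RSS"}],
#            "folder": "following",
#          }
#        },
#      }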


def get_cache_and_feeds(cache_file: Path, feeds_file: Path) -> tuple[dict, dict]:
    """Load the feed definitions and the cache of already-saved article URLs."""
    with feeds_file.open("r") as file:
        feeds = json.load(file)
    if cache_file.exists():
        with cache_file.open("r") as file:
            cache = json.load(file)
        # Make sure newly added feeds exist in the cache.
        cache.update(
            {feed_title: [] for feed_title in feeds if feed_title not in cache}
        )
    else:
        cache = {feed_title: [] for feed_title in feeds}
    return cache, feeds
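
# For illustration only, the on-disk formats this implies (feed titles and
# URLs are placeholders): the feeds file maps a title to a feed URL, and the
# cache file maps the same title to the article URLs already sent to Omnivore.
#
#   feeds file:  {"Some Blog": "https://example.com/feed.xml"}
#   cache file:  {"Some Blog": ["https://example.com/post-1", ...]}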


def parse_feed_and_add_to_omnivore(
    cache: dict, feeds: dict, api_url: str, api_token: str, cache_file: Path
) -> None:
    """Save every uncached article from every feed to Omnivore, updating the cache."""
    how_many_new_articles = 0
    how_many_cached_articles = 0
    how_many_article_errors = 0
    how_many_feed_errors = 0
    try:
        for feed_title, feed_url in feeds.items():
            try:
                feed = feedparser.parse(feed_url)
                article_urls = {
                    entry["link"]
                    for entry in feed.get("entries", [])
                    if "link" in entry
                }
            except Exception as error:
                how_many_feed_errors += 1
                logging.exception(error)
                continue
            for article_url in article_urls:
                if article_url not in cache.get(feed_title, []):
                    try:
                        # API call to Omnivore to save the article.
                        requests.post(
                            url=api_url,
                            json=construct_request_payload(
                                article_url=article_url, labels=[feed_title]
                            ),
                            headers={
                                "content-type": "application/json",
                                "authorization": api_token,
                            },
                            timeout=30,
                        ).raise_for_status()
                        how_many_new_articles += 1
                        time.sleep(5)  # be gentle with Omnivore
                    except Exception as error:
                        how_many_article_errors += 1
                        logging.exception(error)
                        continue
                    # Update the cache only after a successful save.
                    cache[feed_title].append(article_url)
                else:
                    how_many_cached_articles += 1
    # Whatever happens, keep the cache file up to date and log the results.
    finally:
        with cache_file.open("w") as file:
            json.dump(cache, file)
        logging.info(
            f"[{datetime.datetime.now().strftime('%d.%m.%Y-%H:%M:%S')}] - "
            f"{how_many_cached_articles} already cached, and "
            f"{how_many_new_articles} new articles."
        )
    # Tell the outside world that errors occurred.
    if how_many_article_errors > 0 or how_many_feed_errors > 0:
        raise Exception(
            f"{how_many_article_errors} errors when adding to Omnivore and "
            f"{how_many_feed_errors} while parsing feeds."
        )
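
# For illustration, the feedparser result shape the function above relies on
# (the URL and the extra entry keys are placeholders; "entries" and "link"
# are real feedparser fields):
#
#   feed = feedparser.parse("https://example.com/feed.xml")
#   feed.get("entries", [])  # -> [{"link": "https://example.com/post-1", ...}]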


if __name__ == "__main__":
    # The root logger defaults to WARNING, which would swallow the
    # logging.info summary above.
    logging.basicConfig(level=logging.INFO)
    api_url = os.environ["API_URL"]
    api_token = os.environ["API_TOKEN"]
    cache_file = Path(os.environ["CACHE_FILE"])
    feeds_file = Path(os.environ["FEEDS_FILE"])
    cache, feeds = get_cache_and_feeds(cache_file=cache_file, feeds_file=feeds_file)
    parse_feed_and_add_to_omnivore(
        cache=cache,
        feeds=feeds,
        api_url=api_url,
        api_token=api_token,
        cache_file=cache_file,
    )
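
# A minimal run sketch, assuming a POSIX shell. The endpoint URL is an
# assumption (Omnivore's public GraphQL endpoint); the token and file paths
# are placeholders:
#
#   export API_URL="https://api-prod.omnivore.app/api/graphql"
#   export API_TOKEN="<your Omnivore API token>"
#   export CACHE_FILE="cache.json"
#   export FEEDS_FILE="feeds.json"
#   python main.py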