generated from UnBParadigmas2023-2/RepositorioTemplate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate-db.py
87 lines (76 loc) · 1.94 KB
/
generate-db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from datetime import datetime, timezone
import polars as pl
# Raw inputs, read relative to the current working directory.
CATEGORIES_JSON_PATH = "tmp/categories.json"
VIDEOS_CSV_PATH = "tmp/videos.csv"

# Category listing: a JSON document whose "items" field is unpacked further
# down in this script.
categories_df = pl.read_json(CATEGORIES_JSON_PATH)

# Video table; date-like columns are parsed on load so that "publishedAt"
# can later be used with the .dt namespace.
videos_df = pl.read_csv(VIDEOS_CSV_PATH, try_parse_dates=True)
# Build the (id, title) lookup table of categories.
# Step 1: one row per entry of the "items" list.
_category_items = categories_df.select("items").explode("items")

# Step 2: flatten each item struct, then its nested "snippet" struct, into
# plain columns.
_category_fields = _category_items.unnest("items").unnest("snippet")

# Step 3: keep only the key and the display name. The id is cast to Int64;
# it is later used as the right-hand key of the join against "categoryId".
categories = _category_fields.select(
    pl.col("id").cast(pl.Int64),
    pl.col("title"),
)
def _prolog_atom_text(column):
    """Return an expression that escapes *column* for use in a quoted atom.

    The values end up between single quotes in the generated facts, so both
    the backslash (the escape character) and the single quote must be
    escaped. Backslashes are handled first; doing it the other way round
    would double the backslashes that were just added in front of quotes.
    Surrounding whitespace is stripped afterwards. The resulting expression
    keeps the original column name.
    """
    return (
        pl.col(column)
        .str.replace_all("\\", "\\\\", literal=True)
        .str.replace_all("'", "\\'", literal=True)
        .str.strip_chars()
    )


# Cleaned video table: de-duplicated on (title, channelTitle), restricted to
# videos published in 2022 or later, with text columns escaped and a
# likes-per-view ratio ("lpv") added.
videos = (
    videos_df
    .unique(subset=["title", "channelTitle"])
    .filter(pl.col("publishedAt").dt.year() >= 2022)
    .select(
        pl.col("categoryId"),
        _prolog_atom_text("title"),
        _prolog_atom_text("channelTitle"),
        pl.col("publishedAt"),
        # A video without views has no meaningful ratio; plain division
        # would yield inf/NaN, which is not a valid number in the emitted
        # facts. Use 0.0 for those rows instead.
        pl.when(pl.col("view_count") > 0)
        .then(pl.col("likes") / pl.col("view_count"))
        .otherwise(0.0)
        .alias("lpv"),
    )
)
# Attach the category name to every video. Both frames have a "title"
# column, so after the join the category's title shows up as "title_right".
_videos_joined = videos.join(categories, left_on="categoryId", right_on="id")

# Drop the numeric key and expose the category title under the name
# "category"; every other column is passed through unchanged.
videos_with_categories = _videos_joined.select(
    pl.all().exclude("categoryId", "title_right"),
    pl.col("title_right").alias("category"),
)
# One category/1 fact per distinct category that actually has videos,
# in alphabetical order.
_distinct_categories = (
    videos_with_categories
    .select("category")
    .unique()
    .sort("category")
)
categories_pl = [
    f"category('{row[0]}')."
    for row in _distinct_categories.iter_rows()
]
# One channel/1 fact per distinct channel title, in sorted order.
_distinct_channels = (
    videos_with_categories
    .select("channelTitle")
    .unique()
    .sort("channelTitle")
)
channels_pl = [
    f"channel('{row[0]}')."
    for row in _distinct_channels.iter_rows()
]
def format_datetime(x):
    """Return the datetime *x* as a POSIX timestamp (float seconds).

    A naive datetime would be interpreted by ``timestamp()`` in the local
    timezone of the machine running the script, making the generated facts
    differ from one machine to another. Naive values are therefore treated
    as UTC (assumption: the source timestamps are UTC -- confirm against the
    CSV). Timezone-aware values are converted as they are.
    """
    if x.tzinfo is None:
        x = x.replace(tzinfo=timezone.utc)
    return x.timestamp()


# One video/5 fact per distinct row:
# video(Category, Channel, Title, PublishedTimestamp, LikesPerView).
videos_pl = [
    f"video('{cat}', '{chan}', '{title}', {format_datetime(off)}, {lpv})."
    for (cat, chan, title, off, lpv) in (
        videos_with_categories
        .select("category", "channelTitle", "title", "publishedAt", "lpv")
        .unique()
        .sort("category", "channelTitle", "publishedAt")
        .iter_rows()
    )
]
# Write the three sections to stdout. Each section is a "%" comment
# heading, the facts one per line, and a trailing blank line.
for _heading, _facts in (
    ("% categorias", categories_pl),
    ("% canais", channels_pl),
    ("% videos", videos_pl),
):
    print(_heading)
    print("\n".join(_facts))
    print()