-
Notifications
You must be signed in to change notification settings - Fork 0
/
multiple_genres.sql
109 lines (98 loc) · 4.06 KB
/
multiple_genres.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
-- Replace single quotes with double quotes in order to use Postgres JSON functions properly
WITH temp_genres AS (
SELECT regexp_replace(genres, '''', '"', 'g') AS genres_fixed, id AS movie_id FROM tyler_burkett379.movies_metadata
),
-- Parse json strings into columns
genres_parsed AS (
SELECT arr.item_object->>'name' AS genre_name, arr.item_object->>'id' AS genre_id, movie_id
FROM temp_genres, jsonb_array_elements(genres_fixed::jsonb) with ordinality arr(item_object, position)
),
-- remove double ids
genres_parsed_except_double_id AS (SELECT * FROM genres_parsed
WHERE movie_id not IN (99080, 109962, 77221,159849,23305,14788,4912,15028,97995,
13209,5511,265189,10991, 110428, 12600, 105045, 119916, 25541, 84198,
18440, 11115, 69234, 168538, 141971, 152795, 42495, 22649,
132641, 298721)),
-- Select only 5 genres
five_genres_table AS (SELECT * FROM genres_parsed_except_double_id WHERE genre_name IN('Family', 'Comedy','Crime', 'Action','Thriller')),
-- M2M relationship table between movies and genres (to query which movies has which genres, or other way around)
movies_genres_relationship AS (
SELECT movie_id, genre_id, genre_name FROM five_genres_table
ORDER BY movie_id, genre_id
),
-- Multiple genre
multiple_genre_table AS (SELECT movie_id, count(genre_id) AS genre_num
FROM movies_genres_relationship
GROUP BY movie_id
HAVING count(genre_id) > 1),
-- Single genre
single_genre_table AS (
SELECT movie_id, count(genre_id) AS genre_num
FROM movies_genres_relationship
GROUP BY movie_id
HAVING count(genre_id) = 1
),
-- production_company_id
product_company_table AS (SELECT id as movie_id,
regexp_replace(array_to_string(regexp_matches(production_companies, '''id'': [0-9]+', 'g'), ';'), '''id'': ', '', 'g')::int
as production_company_id FROM tyler_burkett379.movies_metadata),
--- remove zero budget and zero revenue
non_zero_profit_table AS (SELECT m.id as movie_id,
status as movie_status,
revenue as Total_revenue,
budget as Total_Budget,
revenue-budget as gross_profit,
runtime,
date_part('year', release_date::date) as year_movie
FROM tyler_burkett379.movies_metadata m
WHERE budget != 0 and revenue != 0),
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
------------- MULTIPLE GENRE EXPLORATORY ANALYSIS -----------------------
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
multiple_genre_join_table AS (SELECT m.movie_id as movie_id,
p.genre_name as genre_name,
movie_status,
Total_revenue,
Total_Budget,
gross_profit,
year_movie,
(
CASE
WHEN year_movie BETWEEN '1960' AND '1969' THEN '1960s'
WHEN year_movie BETWEEN '1970' AND '1979' THEN '1970s'
WHEN year_movie BETWEEN '1980' AND '1989' THEN '1980s'
WHEN year_movie BETWEEN '1990' AND '1999' THEN '1990s'
WHEN year_movie BETWEEN '2000' AND '2009' THEN '2000s'
WHEN year_movie BETWEEN '2010' AND '2020' THEN '2010s' ELSE 'out_of_range'
END) AS decade_group,
runtime,
(CASE WHEN gross_profit > (Total_revenue * 0.50) THEN 1 ELSE 0 END) as is_profit
FROM non_zero_profit_table m
JOIN movies_genres_relationship p ON p.movie_id = m.movie_id
JOIN multiple_genre_table mg ON mg.movie_id = p.movie_id),
-- Find rows which have more than 2 genres
double_genre_tables AS (SELECT movie_id, count(movie_id) as count_num
FROM multiple_genre_join_table
GROUP BY movie_id
HAVING count(movie_id) > 2),
-- Extract only 2 genres rows
double_genres_diff_table AS (SELECT *
FROM multiple_genre_join_table r1
WHERE NOT EXISTS (SELECT FROM double_genre_tables r2 WHERE r1.movie_id = r2.movie_id)),
-- Create new column which has genre_list for double genres
genre_list_table AS (SELECT movie_id,
STRING_AGG(genre_name, '&') as genre_list
FROM double_genres_diff_table
GROUP BY movie_id)
SELECT DISTINCT ON (d1.movie_id) d1.movie_id, d2.genre_list, movie_status,
Total_revenue,
Total_Budget,
gross_profit,
year_movie,
decade_group,
runtime,
is_profit
FROM double_genres_diff_table d1
JOIN genre_list_table d2 ON d1.movie_id = d2.movie_id