-
Notifications
You must be signed in to change notification settings - Fork 0
/
merging-tables-with-diff-types.py
86 lines (67 loc) · 4.51 KB
/
merging-tables-with-diff-types.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Merge the movies table, as the left table, with the financials table using a left join, and save the result to movies_financials.
movies_financials = movies.merge(financials,how='left')
# Count the number of rows in movies_financials with a null value in the budget column.
movies_financials = movies.merge(financials, on='id', how='left')
# Count the number of rows in the budget column that are missing
number_of_missing_fin = movies_financials['budget'].isnull().sum()
# Print the number of movies missing financials
print(number_of_missing_fin)
# Merge toy_story and taglines on the id column with a left join, and save the result as toystory_tag
toystory_tag = toy_story.merge(taglines,on='id',how='left')
# Merge the toy_story and taglines tables with a inner join
toystory_tag = toy_story.merge(taglines,on='id')
# Merge action_movies to the scifi_movies with right join
action_scifi = action_movies.merge(scifi_movies, on='movie_id', how='right',
suffixes=('_act','_sci'))
# From action_scifi, select only the rows where the genre_act column is null
scifi_only = action_scifi[action_scifi['genre_act'].isnull()]
# Merge the movies and scifi_only tables with an inner join
movies_and_scifi_only = movies.merge(scifi_only,left_on='id',right_on='movie_id')
# Print the first few rows and shape of movies_and_scifi_only
print(movies_and_scifi_only.head())
print(movies_and_scifi_only.shape)
# Merge movie_to_genres and pop_movies using a right join. Save the results as genres_movies.
genres_movies = movie_to_genres.merge(pop_movies, how='right',
left_on='movie_id',
right_on='id')
# Group genres_movies by genre and count the number of id values
genre_count = genres_movies.groupby('genre').agg({'id':'count'})
# Plot a bar chart of the genre_count
genre_count.plot(kind='bar')
plt.show()
# Save to iron_1_and_2 the merge of iron_1_actors (left) with iron_2_actors tables with an outer join on the id column, and set suffixes to ('_1','_2').
iron_1_and_2 = iron_1_actors.merge(iron_2_actors,
how='outer',
on='id',
suffixes=('_1','_2'))
# Create an index that returns True if name_1 or name_2 are null, and False otherwise
m = ((iron_1_and_2['name_1'].isnull()) |
(iron_1_and_2['name_2'].isnull()))
# Print the first few rows of iron_1_and_2
print(iron_1_and_2[m].head())
# To a variable called crews_self_merged, merge the crews table to itself on the id column using an inner join, setting the suffixes to '_dir' and '_crew' for the left and right tables respectively.
crews_self_merged = crews.merge(crews,on='id',suffixes=('_dir','_crew'))
# Create a Boolean index, named boolean_filter, that selects rows from the left table with the job of 'Director' and avoids rows with the job of 'Director' in the right table.
# Merge the crews table to itself
crews_self_merged = crews.merge(crews, on='id', how='inner',
suffixes=('_dir','_crew'))
# Create a Boolean index to select the appropriate
boolean_filter = ((crews_self_merged['job_dir'] == 'Director') &
(crews_self_merged['job_crew'] != 'Director'))
direct_crews = crews_self_merged[boolean_filter]
print(direct_crews.head())
# Merge to the movies table the ratings table on the index
movies_ratings = movies.merge(ratings,on='id')
print(movies_ratings.head())
# With the sequels table on the left, merge to it the financials table on index named id, ensuring that all the rows from the sequels are returned and some rows from the other table may not be returned, Save the results to sequels_fin.
sequels_fin = sequels.merge(financials,on='id',how='left')
# Merge the sequels_fin table to itself with an inner join, where the left and right tables merge on sequel and id respectively with suffixes equal to ('_org','_seq'), saving to orig_seq
orig_seq = sequels_fin.merge(sequels_fin, how='inner', left_on='sequel',
right_on='id', right_index=True,
suffixes=('_org','_seq'))
# Add calculation to subtract revenue_org from revenue_seq
orig_seq['diff'] = orig_seq['revenue_seq'] - orig_seq['revenue_org']
# Select the title_org, title_seq, and diff
titles_diff = orig_seq[['title_org','title_seq','diff']]
# Print the first rows of the sorted titles_diff
print(titles_diff.sort_values('diff',ascending=False).head())