app.py
import gc
import random

import nltk
import pandas as pd
import psutil
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Fetch the NLTK resources needed by word_tokenize and the stopword filter.
nltk.download('stopwords')
nltk.download('punkt_tab')

stemmer = PorterStemmer()
# Cache data loading and optimize dtypes
@st.cache_data
def load_data() -> pd.DataFrame:
    """
    Load anime data from a CSV file and optimize data types.

    :returns: A DataFrame containing the anime data with optimized dtypes.
    :rtype: pd.DataFrame
    """
    dtypes = {
        'title': 'category',
        'other_name': 'category',
        'genres': 'category',
        'synopsis': 'string',
        'studio': 'category',
        'demographic': 'category',
        'source': 'category',
        'duration_category': 'category',
        'total_duration_hours': 'float32',
        'score': 'float32',
        'image_url': 'string'
    }
    return pd.read_csv('data/final/AnimeData_25092024.csv', usecols=dtypes.keys(), dtype=dtypes)
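
# Illustrative note (not executed at import): the category/float32 dtypes above
# shrink the frame versus pandas' default object/float64 parse; this can be
# checked with load_data().memory_usage(deep=True).sum().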
stop_words = set(stopwords.words('english'))
def preprocess_text(text: str) -> str:
    """
    Preprocess the input text by tokenizing, stemming, and removing stopwords.

    :param text: The input text to preprocess.
    :returns: The processed text as a single string after tokenization, stemming, and stopword removal.
    :rtype: str
    """
    tokens = word_tokenize(text.lower())
    processed = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(processed)
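
# Illustrative example: preprocess_text("Dogs are running fast!") tokenizes to
# ['dogs', 'are', 'running', 'fast', '!'], drops the stopword 'are' and the
# non-alphabetic '!', and stems the rest, returning "dog run fast".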
# Cache the TF-IDF vectorization and k-NN model to avoid recomputation
@st.cache_resource
def vectorize_and_build_model(df: pd.DataFrame) -> tuple[NearestNeighbors, TfidfVectorizer]:
    """
    Vectorize the synopses of the anime DataFrame and build a k-NN model.

    :param df: The DataFrame containing anime data.
    :returns: A tuple containing the k-NN model and the TF-IDF vectorizer.
    :rtype: tuple[NearestNeighbors, TfidfVectorizer]
    """
    # Missing synopses become empty strings, which vectorize to all-zero rows.
    df['stemmed_synopsis'] = df['synopsis'].apply(lambda x: preprocess_text(x) if pd.notna(x) else '')
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['stemmed_synopsis'])
    knn_model = NearestNeighbors(n_neighbors=5, metric='cosine').fit(tfidf_matrix)
    return knn_model, tfidf_vectorizer
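
# A minimal standalone sketch of the same idea (assumes a tiny hypothetical
# DataFrame; the real app fits on the full anime dataset loaded below):
#   sample = pd.DataFrame({'synopsis': ['a pirate sails the seas',
#                                       'a ninja trains in secret']})
#   model, vec = vectorize_and_build_model(sample)
#   dist, idx = model.kneighbors(vec.transform([preprocess_text('pirate adventure')]),
#                                n_neighbors=1)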
# Recommend anime using the k-NN model and TF-IDF vectorizer
def recommend_anime_knn(query: str, tfidf_vectorizer: TfidfVectorizer, knn_model: NearestNeighbors, top_n: int = 5) -> pd.DataFrame:
    """
    Recommend anime based on a user query using the k-NN model and TF-IDF vectorization.

    :param query: The user input query describing the desired anime.
    :param tfidf_vectorizer: The fitted TF-IDF vectorizer.
    :param knn_model: The fitted k-NN model.
    :param top_n: The number of recommendations to return.
    :returns: A DataFrame containing the recommended anime titles and their attributes.
    :rtype: pd.DataFrame
    """
    query_processed = preprocess_text(query)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    # Over-fetch (top_n + 5) so that results whose titles merely echo the
    # query can be filtered out while still returning top_n recommendations.
    distances, indices = knn_model.kneighbors(query_tfidf, n_neighbors=top_n + 5)
    # `data` is the module-level DataFrame loaded by load_data() below.
    recommendations = data.iloc[indices[0]][['title', 'other_name', 'genres', 'synopsis', 'studio', 'demographic', 'source', 'duration_category', 'total_duration_hours', 'score']]
    # regex=False treats the raw user query literally, so characters like '?'
    # or '(' cannot break the title match.
    filtered_recommendations = recommendations[~recommendations['title'].str.contains(query, case=False, na=False, regex=False)]
    # If filtering removed everything, fall back to the unfiltered neighbors.
    if filtered_recommendations.empty:
        filtered_recommendations = recommendations
    return filtered_recommendations.head(top_n)
# Full pipeline to get recommendations
def anime_recommendation_pipeline(user_query: str, top_n: int = 5) -> pd.DataFrame:
    """
    Execute the full pipeline to recommend anime based on user input.

    :param user_query: The user input query describing the desired anime.
    :param top_n: The number of recommendations to return.
    :returns: A DataFrame containing the recommended anime, sorted by score in descending order.
    :rtype: pd.DataFrame
    """
    knn_model, tfidf_vectorizer = vectorize_and_build_model(data)
    recommended_animes = recommend_anime_knn(user_query, tfidf_vectorizer, knn_model, top_n)
    recommended_titles = recommended_animes['title']
    recommendations = data.loc[data['title'].isin(recommended_titles)].sort_values(by='score', ascending=False)
    # Free memory after processing
    del recommended_animes
    gc.collect()
    return recommendations
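
# Example usage (hypothetical query; assumes the module-level `data` has been
# loaded, as happens when the app runs):
#   results = anime_recommendation_pipeline("a high school romance", top_n=3)
#   st.dataframe(results[['title', 'score']])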
# Monitor memory usage
def monitor_memory() -> None:
    """
    Monitor and display the current system-wide memory usage.

    :rtype: None
    """
    # psutil.virtual_memory().percent reports host-wide usage, not just this app.
    st.write(f"Memory usage: {psutil.virtual_memory().percent}%")
    gc.collect()
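
# A process-level alternative (a sketch, not wired into the UI): psutil can
# also report this app's own resident memory rather than host-wide usage:
#   rss_mb = psutil.Process().memory_info().rss / 1024 ** 2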
# Streamlit app
st.set_page_config(page_title="AniMate")

# Load custom styles
with open('styles.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

# Initialize session state for navigation
if 'page' not in st.session_state:
    st.session_state.page = 'landing'  # Default to landing page

# Load data
data = load_data()
# Define loading phrases
loading_phrases = [
    "🔍 Searching for hidden gems in the anime universe...",
    "✨ Summoning the perfect anime recommendations...",
    "🎉 Gathering the coolest anime just for you...",
    "📚 Digging through the anime archives for you...",
    "🚀 Launching into the world of anime to find your match...",
    "🌟 Fetching the ultimate anime experience...",
    "🌀 Sifting through dimensions for the best recommendations...",
    "💫 Scouring the anime cosmos for your next favorite..."
]
# Landing Page
if st.session_state.page == 'landing':
    st.title("Welcome to AniMate!")
    st.caption("AniMate is a Python-based anime recommendation system that uses natural language processing (NLP) to suggest anime based on user preferences.")
    st.caption(
        """
        If you enjoy our recommendations, please consider starring our repository on GitHub ⭐!
        """
    )
    if st.button("Recommend Me Something"):
        st.session_state.page = 'recommendations'
        # Rerun immediately so the recommendations page renders on this click
        # instead of waiting for the next interaction.
        st.rerun()
    monitor_memory()
    st.subheader("Contributors")
    contributors = [
        {"github": "https://github.com/Asifdotexe", "image": "https://avatars.githubusercontent.com/u/115421661?v=4", "alt_name": "Asif Sayyed"},
        {"github": "https://github.com/PranjalDhamane", "image": "https://avatars.githubusercontent.com/u/131870182?v=4", "alt_name": "Pranjal Dhamane"},
        {"github": "https://github.com/tanvisivaraj", "image": "https://avatars.githubusercontent.com/u/132070958?v=4", "alt_name": "Tanvi Sivaraj"},
        {"github": "https://github.com/str04", "image": "https://avatars.githubusercontent.com/u/123924840?v=4", "alt_name": "Shrawani Thakur"},
        {"github": "https://github.com/aditimane07", "image": "https://avatars.githubusercontent.com/u/129670339?v=4", "alt_name": "Aditi Mane"},
    ]
    cols = st.columns(len(contributors))
    for col, contributor in zip(cols, contributors):
        with col:
            st.markdown(f"[![Contributor Icon]({contributor['image']})]({contributor['github']})")
            st.caption(contributor['alt_name'])
# Recommendations Page
else:
    st.title("AniMate")
    st.caption("AniMate is a Python-based anime recommendation system that uses natural language processing (NLP) to suggest anime based on user preferences.")
    monitor_memory()
    query, number = st.columns([4, 1])
    with query:
        user_query = st.text_input("Describe a plot! Let's see if we can find something that matches that.")
    with number:
        num_recommendations = st.number_input("No. of results:", min_value=1, max_value=20, value=5)
    if st.button("Get Recommendations"):
        if user_query.strip():
            st.write("### Recommendations based on your input:")
            with st.spinner(random.choice(loading_phrases)):
                recommended_animes = anime_recommendation_pipeline(user_query, num_recommendations)
            if recommended_animes.empty:
                st.warning("No recommendations found. Please try a different query.")
            else:
                for index, row in recommended_animes.iterrows():
                    with st.expander(f"**{row['title'].title()}**"):
                        image_column, text_column = st.columns([1, 3])
                        with image_column:
                            if pd.notna(row['image_url']):
                                st.image(row['image_url'], caption=row['title'].title(), width=100)
                        with text_column:
                            # Show only the attributes that are present for this row.
                            for column in ['other_name', 'genres', 'synopsis', 'studio', 'demographic', 'source', 'duration_category', 'total_duration_hours']:
                                value = row[column]
                                if pd.notna(value):
                                    st.write(f"**{column.replace('_', ' ').title()}:** {value}")
        else:
            st.warning("Please enter a valid query to get recommendations.")