Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

closes #12 #14

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
pandas==2.2.3
scikit-learn
tqdm
6 changes: 5 additions & 1 deletion run_all_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
print(os.path.basename(__file__), f'os.getcwd(): {os.getcwd()}')
import step1_prepare.step1_1_download_data
#import step1_prepare.step1_2_preprocess_data
#import step1_prepare.step1_3_split_data

# Step 1.3 Split Data
# Writes train_/test_ copies of every CSV found in the given directory into a
# 'split' sub-directory of it.
# NOTE(review): '.' is resolved against the current working directory, so the
# files that get split depend on where this script is launched from — confirm
# the cwd is the directory holding the step 1 CSV files.
import step1_prepare.step1_3_split_data
step1_prepare.step1_3_split_data.split_multiple_files(input_directory='.')


print('\n*** Step 2. Train Model 🌏🚀 ***')
#os.chdir('../step2_train')
Expand Down
15 changes: 15 additions & 0 deletions step1_prepare/split/test_step1_1_storybook_learning_events.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
id,timestamp,android_id,package_name,storybook_id,storybook_title,learning_event_type
233,632902112,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
203,1726876592,f94ac8506e31b8d2,ai.elimu.vitabu,61.0,कुत्ते के अंडे,STORYBOOK_OPENED
191,-94971816,f94ac8506e31b8d2,ai.elimu.vitabu,15.0,उड़ने वाला ऑटो,STORYBOOK_OPENED
196,486526184,f94ac8506e31b8d2,ai.elimu.vitabu,15.0,उड़ने वाला ऑटो,STORYBOOK_OPENED
215,1622838296,f94ac8506e31b8d2,ai.elimu.vitabu,51.0,घूम-घूम घड़ियाल का अनोखा सफ़र,STORYBOOK_OPENED
228,1644826296,f94ac8506e31b8d2,ai.elimu.vitabu,11.0,गप्पू नाच नहीं सकती,STORYBOOK_OPENED
245,1928921816,f94ac8506e31b8d2,ai.elimu.vitabu,30.0,मलार का बड़ा सा घर,STORYBOOK_OPENED
192,-95016816,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
248,-2011856480,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
199,1855070888,f94ac8506e31b8d2,ai.elimu.vitabu,5.0,बनबिलाव! बनबिलाव!,STORYBOOK_OPENED
212,366023296,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
252,303986520,467ab5528a9f4f82,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
234,632748112,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
187,-94670816,f94ac8506e31b8d2,ai.elimu.vitabu,38.0,हमारे मित्र कौन है?,STORYBOOK_OPENED
13 changes: 13 additions & 0 deletions step1_prepare/split/test_step1_1_storybooks.csv

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions step1_prepare/split/train_step1_1_storybook_learning_events.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
id,timestamp,android_id,package_name,storybook_id,storybook_title,learning_event_type
241,1278423112,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
242,1278181112,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
236,632620112,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
194,93390184,f94ac8506e31b8d2,ai.elimu.vitabu,40.0,सूरज का दोस्त कौन ?,STORYBOOK_OPENED
229,-1845847000,f94ac8506e31b8d2,ai.elimu.vitabu,63.0,आनंद,STORYBOOK_OPENED
218,1622728296,f94ac8506e31b8d2,ai.elimu.vitabu,30.0,मलार का बड़ा सा घर,STORYBOOK_OPENED
223,1622601296,f94ac8506e31b8d2,ai.elimu.vitabu,53.0,एक सौ सैंतीसवाँ पैर,STORYBOOK_OPENED
206,1876735592,f94ac8506e31b8d2,ai.elimu.vitabu,,रमाइलो दिन,STORYBOOK_OPENED
232,632928112,f94ac8506e31b8d2,ai.elimu.vitabu,23.0,स्वतंत्रता की ओर,STORYBOOK_OPENED
220,1622661296,f94ac8506e31b8d2,ai.elimu.vitabu,23.0,स्वतंत्रता की ओर,STORYBOOK_OPENED
235,632718112,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
217,1622763296,f94ac8506e31b8d2,ai.elimu.vitabu,40.0,सूरज का दोस्त कौन ?,STORYBOOK_OPENED
200,1854631888,f94ac8506e31b8d2,ai.elimu.vitabu,2.0,आलू-मालू-कालू,STORYBOOK_OPENED
250,-2013353480,f94ac8506e31b8d2,ai.elimu.vitabu,66.0,"एक सफ़र, एक खेल",STORYBOOK_OPENED
227,1644842296,f94ac8506e31b8d2,ai.elimu.vitabu,41.0,राजू की पहली हवाई-यात्रा,STORYBOOK_OPENED
190,-94966816,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
204,1877244592,f94ac8506e31b8d2,ai.elimu.vitabu,11.0,गप्पू नाच नहीं सकती,STORYBOOK_OPENED
221,1622635296,f94ac8506e31b8d2,ai.elimu.vitabu,41.0,राजू की पहली हवाई-यात्रा,STORYBOOK_OPENED
195,93373184,f94ac8506e31b8d2,ai.elimu.vitabu,41.0,राजू की पहली हवाई-यात्रा,STORYBOOK_OPENED
231,632993112,f94ac8506e31b8d2,ai.elimu.vitabu,23.0,स्वतंत्रता की ओर,STORYBOOK_OPENED
193,93411184,f94ac8506e31b8d2,ai.elimu.vitabu,39.0,बंटी और उसके गाते हुए पक्षी,STORYBOOK_OPENED
243,1497538816,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
253,2096156928,f94ac8506e31b8d2,ai.elimu.vitabu,5.0,बनबिलाव! बनबिलाव!,STORYBOOK_OPENED
202,327424592,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
214,1622882296,f94ac8506e31b8d2,ai.elimu.vitabu,27.0,तारा की गगनचुंबी यात्रा,STORYBOOK_OPENED
213,1622954296,f94ac8506e31b8d2,ai.elimu.vitabu,27.0,तारा की गगनचुंबी यात्रा,STORYBOOK_OPENED
211,366029296,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
254,2095900928,f94ac8506e31b8d2,ai.elimu.vitabu,30.0,मलार का बड़ा सा घर,STORYBOOK_OPENED
198,507708184,e142205d609d6032,ai.elimu.vitabu,49.0,लाल बरसाती,STORYBOOK_OPENED
219,1622690296,f94ac8506e31b8d2,ai.elimu.vitabu,5.0,बनबिलाव! बनबिलाव!,STORYBOOK_OPENED
251,1929551816,467ab5528a9f4f82,ai.elimu.vitabu,30.0,मलार का बड़ा सा घर,STORYBOOK_OPENED
237,632575112,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
224,1622570296,f94ac8506e31b8d2,ai.elimu.vitabu,11.0,गप्पू नाच नहीं सकती,STORYBOOK_OPENED
216,1622808296,f94ac8506e31b8d2,ai.elimu.vitabu,55.0,जादुर्इ गुटका,STORYBOOK_OPENED
230,-2002281592,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
240,1278804112,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
188,-94702816,f94ac8506e31b8d2,ai.elimu.vitabu,41.0,राजू की पहली हवाई-यात्रा,STORYBOOK_OPENED
208,1874087592,f94ac8506e31b8d2,ai.elimu.vitabu,,रमाइलो दिन,STORYBOOK_OPENED
189,-94961816,f94ac8506e31b8d2,ai.elimu.vitabu,10.0,रिमझिम बरसे बादल,STORYBOOK_OPENED
249,-2013290480,f94ac8506e31b8d2,ai.elimu.vitabu,66.0,"एक सफ़र, एक खेल",STORYBOOK_OPENED
226,1644861296,f94ac8506e31b8d2,ai.elimu.vitabu,22.0,मुत्तज्जी की उम्र क्या है?,STORYBOOK_OPENED
222,1622614296,f94ac8506e31b8d2,ai.elimu.vitabu,15.0,उड़ने वाला ऑटो,STORYBOOK_OPENED
239,1278808112,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
210,366039296,f94ac8506e31b8d2,ai.elimu.vitabu,2.0,आलू-मालू-कालू,STORYBOOK_OPENED
246,1928872816,f94ac8506e31b8d2,ai.elimu.vitabu,30.0,मलार का बड़ा सा घर,STORYBOOK_OPENED
197,507715184,e142205d609d6032,ai.elimu.vitabu,48.0,ग़ोलू एक ग़ोल कि कहानी,STORYBOOK_OPENED
209,-256646704,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
205,1876753592,f94ac8506e31b8d2,ai.elimu.vitabu,,रमाइलो दिन,STORYBOOK_OPENED
244,1497521816,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
225,1622501296,f94ac8506e31b8d2,ai.elimu.vitabu,29.0,"आज, मैं हूँ...",STORYBOOK_OPENED
207,1874776592,f94ac8506e31b8d2,ai.elimu.vitabu,,रमाइलो दिन,STORYBOOK_OPENED
247,-2011834480,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
201,1853764888,f94ac8506e31b8d2,ai.elimu.vitabu,1.0,"अभी नहीं, अभी नहीं!",STORYBOOK_OPENED
238,1278861112,f94ac8506e31b8d2,ai.elimu.vitabu,37.0,अद्भुत कीड़े,STORYBOOK_OPENED
47 changes: 47 additions & 0 deletions step1_prepare/split/train_step1_1_storybooks.csv

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions step1_prepare/step1_3_split_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def split_multiple_files(input_directory, train_ratio=0.8, chunk_size=50000):
    """Split every CSV file in a directory into train and test CSV files.

    Each ``*.csv`` file directly inside *input_directory* is read in chunks of
    *chunk_size* rows. Every chunk is passed through ``optimize_memory`` and
    split with ``train_test_split`` using a fixed seed (``random_state=42``).
    The results are written to ``<input_directory>/split/train_<file>`` and
    ``<input_directory>/split/test_<file>``.

    A failure while processing one file is reported on stdout and does not
    stop the remaining files from being processed.

    Args:
        input_directory: Directory containing the CSV files to split.
        train_ratio: Fraction of each chunk's rows assigned to the train
            split; must be strictly between 0 and 1.
        chunk_size: Number of rows read from a file at a time.

    Raises:
        ValueError: If *input_directory* is not a readable directory, if
            *train_ratio* is outside the open interval (0, 1), or if the
            directory contains no CSV files.
    """
    # isdir (not exists): a regular file would otherwise make os.listdir raise
    # a different exception type further down.
    if not os.path.isdir(input_directory) or not os.access(input_directory, os.R_OK):
        raise ValueError(f"Invalid or unreadable directory: {input_directory}")

    # Fail once, up front, instead of reporting the same error for every file.
    if not 0 < train_ratio < 1:
        raise ValueError(f"train_ratio must be between 0 and 1 (exclusive): {train_ratio}")

    # Sorted so that the processing order does not depend on the file system.
    csv_files = sorted(f for f in os.listdir(input_directory) if f.lower().endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No CSV files found in {input_directory}.")

    output_dir = os.path.join(input_directory, 'split')
    os.makedirs(output_dir, exist_ok=True)

    # Iterating through tqdm advances the progress bar once per file, including
    # files that are skipped or fail.
    for file in tqdm(csv_files, desc="Processing files"):
        file_path = os.path.join(input_directory, file)

        if os.stat(file_path).st_size == 0:
            print(f"Skipping empty file: {file}")
            continue

        try:
            has_rows = _split_file(file_path, output_dir, file, train_ratio, chunk_size)
        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not abort the batch.
            print(f"Error processing {file}: {e}")
        else:
            if has_rows:
                print(f"Processed {file} successfully.")
            else:
                print(f"Skipping file without data rows: {file}")


def _split_file(file_path, output_dir, file, train_ratio, chunk_size):
    """Write the train/test split of one CSV file; return False if it has no data rows."""
    train_parts, test_parts = [], []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        if len(chunk) == 0:
            # Header-only input: nothing to split.
            continue
        chunk = optimize_memory(chunk)
        if len(chunk) < 2:
            # train_test_split cannot split a single row; keep it in the train
            # set rather than failing (and thereby discarding) the whole file.
            train_parts.append(chunk)
            continue
        train, test = train_test_split(chunk, test_size=1 - train_ratio, random_state=42)
        train_parts.append(train)
        test_parts.append(test)

    if not train_parts:
        return False

    train_df = pd.concat(train_parts)
    # With no test rows at all, still emit a header-only test file so that both
    # outputs always exist with the same columns.
    test_df = pd.concat(test_parts) if test_parts else train_df.iloc[0:0]

    train_df.to_csv(os.path.join(output_dir, f"train_{file}"), index=False)
    test_df.to_csv(os.path.join(output_dir, f"test_{file}"), index=False)
    return True

def optimize_memory(data):
    """Downcast 64-bit numeric columns to 32-bit where that loses no information.

    ``float64`` columns become ``float32`` only if every value survives the
    round trip unchanged (NaN positions included). ``int64`` columns become
    ``int32`` only if all values fit into the signed 32-bit range. Columns
    that do not qualify keep their dtype, so large values such as millisecond
    timestamps are never wrapped around or rounded.

    Args:
        data: DataFrame to optimize; its columns are replaced in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    for col in data.select_dtypes(include=['float64']).columns:
        narrowed = data[col].astype('float32')
        # Series.equals treats NaNs in matching positions as equal.
        if narrowed.astype('float64').equals(data[col]):
            data[col] = narrowed

    for col in data.select_dtypes(include=['int64']).columns:
        values = data[col]
        # A plain astype('int32') silently wraps out-of-range integers, so
        # check the bounds first (min/max of an empty column are NaN).
        if values.empty or (values.min() >= int32_min and values.max() <= int32_max):
            data[col] = values.astype('int32')

    return data

if __name__ == "__main__":
    # Split the CSV files that live next to this script. Deriving the directory
    # from __file__ keeps the entry point usable on any machine, unlike a
    # hard-coded absolute path.
    try:
        split_multiple_files(input_directory=os.path.dirname(os.path.abspath(__file__)))
    except Exception as e:
        # Exit with a non-zero status so that a failed run is detectable by
        # whatever launched the script.
        raise SystemExit(f"Error: {e}")
Loading