Original Data: MovieLens 100k
SQLite DB file: movielens.db.zip
To run the tests, you need download the SQLite DB file and put it in the current directory.
# download and unzip the SQLite DB file
wget https://github.com/auxten/go-ctr/files/9895974/movielens.db.zip && unzip movielens.db.zip
SQL that split training set and test set by 80% and 20% user:
-- import data from csv, do it with any tool
select count(distinct userId) from ratings; -- 610 users
create table user as select distinct userId, 0 as is_train from ratings;
-- choose 80% random user as train user
update user
set is_train = 1
where userId in
(SELECT userId
FROM (select distinct userId from ratings)
ORDER BY RANDOM()
LIMIT 488);
select count(*) from user where is_train != 1;
-- split train and test set of movielens ratings
create table ratings_train as
select r.userId, movieId, rating, timestamp
from ratings r
left join user u on r.userId = u.userId
where is_train = 1;
create table ratings_test as
select r.userId, movieId, rating, timestamp
from ratings r
left join user u on r.userId = u.userId
where is_train = 0;
select count(*) from ratings_train;
select count(*) from ratings_test;
select count(*) from ratings;
select count(distinct movieId) from movies
There is another way to split the MovieLens-20m dataset with userId that is described in the Deep Interest Network paper.
Related SQL:
create table movies
(
movieId INTEGER,
title TEXT,
genres TEXT
);
create table ratings
(
userId INTEGER,
movieId INTEGER,
rating FLOAT,
timestamp INTEGER
);
create table tags
(
userId INTEGER,
movieId INTEGER,
tag TEXT,
timestamp INTEGER
);
-- import data from csv, do it with any tool
select count(distinct userId) from ratings; -- 138,493 users
create table user as select distinct userId, 0 as is_train from ratings;
-- choose 100000 random user as train user
update user
set is_train = 1
where userId in
(SELECT userId
FROM (select distinct userId from ratings)
ORDER BY RANDOM()
LIMIT 100000);
select count(*) from user where is_train != 1; -- 38,493 test users
-- split train and test set of movielens-20m ratings
create table ratings_train as
select r.userId, movieId, rating, timestamp
from ratings r
left join user u on r.userId = u.userId
where is_train = 1;
create table ratings_test as
select r.userId, movieId, rating, timestamp
from ratings r
left join user u on r.userId = u.userId
where is_train = 0;
select count(*) from ratings_train; --14,393,526
select count(*) from ratings_test; --5,606,737
select count(*) from ratings; --20,000,263