import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import numpy as np
import pandas as pd
# !wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
# !unzip ml-100k.zip
# Load the MovieLens 100k "u1" train/test rating splits.
# Both files are read as tab-separated text without a header row, so the
# column names are attached after parsing.
ratings_cols = ['user_id', 'item_id', 'rating', 'timestamp']


def _load_ratings_split(path, columns):
    """Read one ratings split from *path* and label its columns."""
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = columns
    return frame


ratings_train_df = _load_ratings_split('ml-100k/u1.base', ratings_cols)
print(f'ratings_train_df.shape \t - {ratings_train_df.shape}')

ratings_test_df = _load_ratings_split('ml-100k/u1.test', ratings_cols)
print(f'ratings_test_shape \t - {ratings_test_df.shape}')

# Notebook-style preview of the first two training rows.
ratings_train_df.head(2)
ratings_train_df.shape - (80000, 4)
ratings_test_shape - (20000, 4)
|   | user_id | item_id | rating | timestamp |
|---|---------|---------|--------|-----------|
| 0 | 1 | 1 | 5 | 874965758 |
| 1 | 1 | 2 | 3 | 876893171 |
# Load the movie metadata table (pipe-separated, no header row).
# The GroupLens u.item file stores accented titles as ISO-8859-1, so the
# encoding is stated explicitly instead of relying on the UTF-8 default,
# which can raise UnicodeDecodeError on those bytes.
movies_df = pd.read_csv('ml-100k/u.item', sep='|', header=None, encoding='latin-1')
# The label names the frame that is actually being reported.
print(f'movies_df.shape - {movies_df.shape}')
# Five descriptive columns followed by one 0/1 indicator column per genre.
movie_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown',
    'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary',
    'drama', 'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance',
    'scifi', 'thriller', 'war', 'western',
]
movies_df.columns = movie_cols
# Notebook-style preview of the first two movies.
movies_df.head(2)
users_df.shape - (1682, 24)
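|   | movie_id | title | release_date | video_release_date | imdb_url | unknown | action | adventure | animation | childrens | ... | fantasy | film-noir | horror | musical | mystery | romance | scifi | thriller | war | western |
|---|----------|-------|--------------|--------------------|----------|---------|--------|-----------|-----------|-----------|-----|---------|-----------|--------|---------|---------|---------|-------|----------|-----|---------|
| 0 | 1 | Toy Story (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?Toy%20Story%2... | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 2 | GoldenEye (1995) | 01-Jan-1995 | NaN | http://us.imdb.com/M/title-exact?GoldenEye%20(... | 0 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |

2 rows × 24 columns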
# Load the user demographics table (pipe-separated, no header row).
# The schema is declared first and attached once the file has been parsed.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users_df = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
)
users_df.columns = user_cols
# Notebook-style preview of the first two users.
users_df.head(2)
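|   | user_id | age | gender | occupation | zip_code |
|---|---------|-----|--------|------------|----------|
| 0 | 1 | 24 | M | technician | 85711 |
| 1 | 2 | 53 | F | other | 94043 |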
References
https://yonigottesman.github.io/recsys/pytorch/elasticsearch/2020/02/18/fm-torch-to-recsys.html