Santander Product Recommendation

From SVD to Deep Learning
Published March 7, 2022

# Download the data
#!kaggle competitions download -c santander-product-recommendation
#!mv santander-product-recommendation.zip data/
# !cd data;unzip santander-product-recommendation.zip;unzip train_ver2.csv.zip;unzip test_ver2.csv.zip
!ls -lrth data
total 2.7G
-rw-r--r-- 1 achinta achinta 106M Oct 27  2016 test_ver2.csv
-rwxrwxrwx 1 achinta achinta 2.2G Oct 27  2016 train_ver2.csv
-rw-rw-r-- 1 achinta achinta  13M Dec 11  2019 test_ver2.csv.zip
-rw-rw-r-- 1 achinta achinta 2.3M Dec 11  2019 sample_submission.csv.zip
-rw-rw-r-- 1 achinta achinta 215M Dec 11  2019 train_ver2.csv.zip
-rw-rw-r-- 1 achinta achinta 229M Mar  7 10:30 santander-product-recommendation.zip
! wc -l data/train_ver2.csv
13647310 data/train_ver2.csv

So we have 13.6M rows in the training dataset.

SVD
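The idea in brief: treat the users × products ownership matrix as a ratings matrix, fill the missing entries with per-product means, and approximate the result with a truncated SVD. Writing A for the filled utility matrix, \bar{a} for the row of item means, and k for the number of latent factors kept:

A - \bar{a} \approx U_k \Sigma_k V_k^\top, \qquad \hat{A} = (U_k \Sigma_k^{1/2})(\Sigma_k^{1/2} V_k^\top) + \bar{a}

The \Sigma_k^{1/2} split is why scipy.linalg.sqrtm is imported below.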

import numpy as np
import pandas as pd
from scipy.linalg import sqrtm
from copy import deepcopy
train_10k = pd.read_csv('data/train_ver2.csv', nrows=10000)
print(f'columns- {train_10k.columns}')
columns- Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'],
      dtype='object')

Here are the columns we will use in the SVD approach. We ignore the categorical features (I am not sure whether they can be used in a plain SVD). The product flag columns can also be selected programmatically, as the snippet after the table shows.

Column            Description
fecha_dato        Month of purchase
ncodpers          Customer code
ind_(xyz)_ult1    Product flags whose ownership we have to predict
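As a convenience, the 24 product flag columns can be picked out programmatically rather than typed by hand. This relies on the fact (which holds for this dataset) that all and only the target columns end with '_ult1'; target_cols is my name, not from the original code:

# select the product flag columns by suffix (assumption: '_ult1' identifies exactly the 24 targets)
target_cols = [c for c in train_10k.columns if c.endswith('_ult1')]
print(len(target_cols))  # 24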
%%time
train_dates = pd.read_csv(f'data/train_ver2.csv', usecols=['fecha_dato'])
test_dates = pd.read_csv(f'data/test_ver2.csv', usecols=['fecha_dato'])
print(f'train data has {train_dates.shape[0]/1e6}M rows')
print(f'test  data has {test_dates.shape[0]/1e3}K rows')

print(f'train months {sorted(train_dates.fecha_dato.unique())}')
print(f'test months {sorted(test_dates.fecha_dato.unique())}')
train data has 13.647309M rows
test  data has 929.615K rows
train months ['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28', '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28', '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28', '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
test months ['2016-06-28']
CPU times: user 11.6 s, sys: 676 ms, total: 12.3 s
Wall time: 12.3 s
train_10k.head(2)
fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad indrel ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0

2 rows × 48 columns

svd_cols = ['fecha_dato', 'ncodpers', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1','ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1','ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1','ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1','ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1','ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
%%time
train = pd.read_csv('data/train_ver2.csv', usecols=svd_cols)
print(f'train.shape - {train.shape}')
%%time

# read one month of data and keep an untouched copy as ground truth
train1 = train[train.fecha_dato == '2015-01-28'].drop('fecha_dato', axis=1).copy()
true = train1.copy()  # assumption: `true` was a copy of the same month; the printed shapes below match
print(f'train1.shape - {train1.shape} and true.shape - {true.shape}')
train1.shape - (625457, 25) and true.shape - (625457, 25)
CPU times: user 695 ms, sys: 163 ms, total: 858 ms
Wall time: 851 ms
users = true['ncodpers'].tolist()
true.drop('ncodpers', axis=1, inplace=True)

items = true.columns.tolist()
print(items)

# map each customer code in `true` to its row index
u = {}
for i in range(len(users)):
    u[users[i]] = i
['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
trueMat = np.array(true)   # ground-truth product matrix
users = train['ncodpers'].tolist()
print(len(users))
# map each customer code to its last row index in the full train set
u = {}
for i in range(len(users)):
    u[users[i]] = i
13647309
# index train1 by customer code so it can be reindexed to the full user list
train1.index = train1['ncodpers'].tolist()
train1.drop('ncodpers', axis=1, inplace=True)
train1.head()
ind_ahor_fin_ult1 ind_aval_fin_ult1 ind_cco_fin_ult1 ind_cder_fin_ult1 ind_cno_fin_ult1 ind_ctju_fin_ult1 ind_ctma_fin_ult1 ind_ctop_fin_ult1 ind_ctpp_fin_ult1 ind_deco_fin_ult1 ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
1375586 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050611 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050612 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050613 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050614 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0

5 rows × 24 columns

%%time
# reindex to the full (13.6M, duplicated) user list; users absent in 2015-01 become all-NaN rows
train1 = train1.reindex(users)
print(f'train1.shape - {train1.shape}')
train1.head(3)
train1.shape - (13647309, 24)
CPU times: user 2.87 s, sys: 6.77 s, total: 9.64 s
Wall time: 9.83 s
ind_ahor_fin_ult1 ind_aval_fin_ult1 ind_cco_fin_ult1 ind_cder_fin_ult1 ind_cno_fin_ult1 ind_ctju_fin_ult1 ind_ctma_fin_ult1 ind_ctop_fin_ult1 ind_ctpp_fin_ult1 ind_deco_fin_ult1 ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
1375586 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1050611 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1050612 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 24 columns

SVD

%%time
utilMat = np.array(train1)
mask = np.isnan(utilMat)    # NaN cells: rows for users absent in 2015-01, plus NaNs already in the raw flags
mask_zero = (utilMat == 0)  # assumption: the dangling original line intended to flag zero cells; unused below
CPU times: user 1.37 s, sys: 11.2 s, total: 12.5 s
Wall time: 12.7 s
np.where(utilMat)   # positions of non-zero entries (NaN counts as truthy, so masked cells appear too)
(array([       0,        1,        2, ..., 13647308, 13647308, 13647308]),
 array([ 2,  2,  2, ..., 21, 22, 23]))
print(np.sum(mask))                          # number of NaN cells
print(utilMat.shape[0] * utilMat.shape[1])   # total number of cells
75917862
327535416
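So roughly 23% of the cells are NaN. Since mask is boolean, its mean gives the missing fraction directly:

print(mask.mean())  # ≈ 0.232 (75917862 / 327535416)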
%%time
masked_arr = np.ma.masked_array(utilMat, mask)   # hide the NaN cells
item_means = np.mean(masked_arr, axis=0)         # per-product mean ownership, ignoring masked cells
print(f'masked_arr.shape - {masked_arr.shape}')
print(f'item_means.shape - {item_means.shape}')
masked_arr.shape - (13647309, 24)
item_means.shape - (24,)
CPU times: user 2.36 s, sys: 9.37 s, total: 11.7 s
Wall time: 11.9 s
print(item_means)
[0.00014086538076379747 3.0805360857621244e-05 0.7842677689708568
 0.00048544670825167844 0.09478923983051138 0.01233997902565028
 0.013272341897923052 0.17154113021149453 0.058354032374049944
 0.0022137895858424282 0.002612161079038354 0.057973781681358506
 0.0969037465041161 0.02239120557495011 0.00805860610224633
 0.012010752692274092 0.0034949301815093824 0.060178987730119864
 0.05746153526288704 0.0327036577694172 0.005214594149384828
 0.0630621993029027 0.07005819202959135 0.1539579452461892]
masked_arr
masked_array(
  data=[[0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        ...,
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [--, --, --, ..., --, --, --]],
  mask=[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=1e+20)
%%time
utilMat2 = masked_arr.filled(item_means)   # fill the masked cells with the per-product means
print(f'utilMat2.shape - {utilMat2.shape}')
utilMat2.shape - (13647309, 24)
CPU times: user 1.73 s, sys: 7.63 s, total: 9.36 s
Wall time: 9.89 s
utilMat2
array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.40865381e-04, 3.08053609e-05, 7.84267769e-01, ...,
        6.30621993e-02, 7.00581920e-02, 1.53957945e-01]])
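Note that the last row of utilMat2 is exactly the item means: that user contributed no row in 2015-01, so every cell in the row was filled. A quick check:

print(np.allclose(utilMat2[-1], item_means))  # True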
%%time
x = np.tile(item_means, (utilMat2.shape[0], 1))   # item means broadcast to the full matrix shape, presumably for mean-centring before the SVD
print(f'x.shape - {x.shape}')
x.shape - (13647309, 24)
CPU times: user 452 ms, sys: 5.99 s, total: 6.44 s
Wall time: 7.17 s
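With the filled matrix utilMat2 and the tiled means x in place, the remaining step is the factorization itself. The scipy.linalg.sqrtm import at the top hints at the usual recipe: centre by the item means, take a thin SVD, keep the top k factors, and split the singular values between the two sides. A minimal sketch, assuming a rank-k truncation (k = 8 is an arbitrary choice, and the variable names are mine, not from the original code):

k = 8
utilMat_centered = utilMat2 - x                  # remove the per-product means

# thin SVD: for a (13647309, 24) matrix, U comes out (13647309, 24) -- memory-heavy but feasible at this width
U, s, V = np.linalg.svd(utilMat_centered, full_matrices=False)
U, s, V = U[:, :k], s[:k], V[:k, :]              # keep the top-k factors

s_root = sqrtm(np.diag(s))                       # matrix square root of the singular values, i.e. diag(sqrt(s))
UsV = U @ s_root @ V + x                         # reconstructed scores with the item means added back
print(UsV.shape)                                 # (13647309, 24)

Recommendations for a user are then the products with the highest reconstructed scores among those the user does not already hold.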
# sanity check: np.isnan on an integer array is all False (ints cannot hold NaN)
a = np.array([[1,0],[2,3]])
a
array([[1, 0],
       [2, 3]])
mask = np.isnan(a)
mask
array([[False, False],
       [False, False]])