Santander Product Recommendation

From SVD to Deep Learning
Published March 7, 2022

# Download the data
#!kaggle competitions download -c santander-product-recommendation
#!mv santander-product-recommendation.zip data/
# !cd data;unzip santander-product-recommendation.zip;unzip train_ver2.csv.zip;unzip test_ver2.csv.zip
!ls -lrth data
total 2.7G
-rw-r--r-- 1 achinta achinta 106M Oct 27  2016 test_ver2.csv
-rwxrwxrwx 1 achinta achinta 2.2G Oct 27  2016 train_ver2.csv
-rw-rw-r-- 1 achinta achinta  13M Dec 11  2019 test_ver2.csv.zip
-rw-rw-r-- 1 achinta achinta 2.3M Dec 11  2019 sample_submission.csv.zip
-rw-rw-r-- 1 achinta achinta 215M Dec 11  2019 train_ver2.csv.zip
-rw-rw-r-- 1 achinta achinta 229M Mar  7 10:30 santander-product-recommendation.zip
! wc -l data/train_ver2.csv
13647310 data/train_ver2.csv

So we have 13.6M rows in the training dataset.

SVD
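The idea in brief: treat the users × products ownership matrix as a ratings matrix, fill the missing entries with per-product means, and approximate the result with a truncated SVD. Writing A for the filled utility matrix, \bar{a} for the row of item means, and k for the number of latent factors kept:

A - \bar{a} \approx U_k \Sigma_k V_k^\top, \qquad \hat{A} = (U_k \Sigma_k^{1/2})(\Sigma_k^{1/2} V_k^\top) + \bar{a}

The \Sigma_k^{1/2} split is why scipy.linalg.sqrtm is imported below.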

import numpy as np
import pandas as pd
from scipy.linalg import sqrtm
from copy import deepcopy
train_10k = pd.read_csv('data/train_ver2.csv', nrows=10000)
print(f'columns- {train_10k.columns}')
columns- Index(['fecha_dato', 'ncodpers', 'ind_empleado', 'pais_residencia', 'sexo',
       'age', 'fecha_alta', 'ind_nuevo', 'antiguedad', 'indrel',
       'ult_fec_cli_1t', 'indrel_1mes', 'tiprel_1mes', 'indresi', 'indext',
       'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'cod_prov',
       'nomprov', 'ind_actividad_cliente', 'renta', 'segmento',
       'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
       'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
       'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
       'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
       'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'],
      dtype='object')

Here are the columns we will use in the SVD approach. We ignore the categorical features (I am not sure whether they can be used in a plain SVD). The product flag columns can also be selected programmatically, as the snippet after the table shows.

Column            Description
fecha_dato        Month of purchase
ncodpers          Customer code
ind_(xyz)_ult1    Product flags whose ownership we have to predict
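As a convenience, the 24 product flag columns can be picked out programmatically rather than typed by hand. This relies on the fact (which holds for this dataset) that all and only the target columns end with '_ult1'; target_cols is my name, not from the original code:

# select the product flag columns by suffix (assumption: '_ult1' identifies exactly the 24 targets)
target_cols = [c for c in train_10k.columns if c.endswith('_ult1')]
print(len(target_cols))  # 24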
%%time
train_dates = pd.read_csv(f'data/train_ver2.csv', usecols=['fecha_dato'])
test_dates = pd.read_csv(f'data/test_ver2.csv', usecols=['fecha_dato'])
print(f'train data has {train_dates.shape[0]/1e6}M rows')
print(f'test  data has {test_dates.shape[0]/1e3}K rows')

print(f'train months {sorted(train_dates.fecha_dato.unique())}')
print(f'test months {sorted(test_dates.fecha_dato.unique())}')
train data has 13.647309M rows
test  data has 929.615K rows
train months ['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28', '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28', '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28', '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
test months ['2016-06-28']
CPU times: user 11.6 s, sys: 676 ms, total: 12.3 s
Wall time: 12.3 s
train_10k.head(2)
fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad indrel ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0

2 rows × 48 columns

svd_cols = ['fecha_dato', 'ncodpers', 'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1','ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
       'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1','ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1','ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
       'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1','ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1','ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
%%time
train = pd.read_csv('data/train_ver2.csv', usecols=svd_cols)
print(f'train.shape - {train.shape}')
%%time

# read one month of data and keep an untouched copy as ground truth
train1 = train[train.fecha_dato == '2015-01-28'].drop('fecha_dato', axis=1).copy()
true = train1.copy()  # assumption: `true` was a copy of the same month; the printed shapes below match
print(f'train1.shape - {train1.shape} and true.shape - {true.shape}')
train1.shape - (625457, 25) and true.shape - (625457, 25)
CPU times: user 695 ms, sys: 163 ms, total: 858 ms
Wall time: 851 ms
users = true['ncodpers'].tolist()
true.drop('ncodpers', axis=1, inplace=True)

items = true.columns.tolist()
print(items)

# map each customer code in `true` to its row index
u = {}
for i in range(len(users)):
    u[users[i]] = i
['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
trueMat = np.array(true)   # ground-truth product matrix
users = train['ncodpers'].tolist()
print(len(users))
# map each customer code to its last row index in the full train set
u = {}
for i in range(len(users)):
    u[users[i]] = i
13647309
# index train1 by customer code so it can be reindexed to the full user list
train1.index = train1['ncodpers'].tolist()
train1.drop('ncodpers', axis=1, inplace=True)
train1.head()
ind_ahor_fin_ult1 ind_aval_fin_ult1 ind_cco_fin_ult1 ind_cder_fin_ult1 ind_cno_fin_ult1 ind_ctju_fin_ult1 ind_ctma_fin_ult1 ind_ctop_fin_ult1 ind_ctpp_fin_ult1 ind_deco_fin_ult1 ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
1375586 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050611 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050612 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050613 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0.0 0.0 0
1050614 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.0 0

5 rows × 24 columns

%%time
# reindex to the full (13.6M, duplicated) user list; users absent in 2015-01 become all-NaN rows
train1 = train1.reindex(users)
print(f'train1.shape - {train1.shape}')
train1.head(3)
train1.shape - (13647309, 24)
CPU times: user 2.87 s, sys: 6.77 s, total: 9.64 s
Wall time: 9.83 s
ind_ahor_fin_ult1 ind_aval_fin_ult1 ind_cco_fin_ult1 ind_cder_fin_ult1 ind_cno_fin_ult1 ind_ctju_fin_ult1 ind_ctma_fin_ult1 ind_ctop_fin_ult1 ind_ctpp_fin_ult1 ind_deco_fin_ult1 ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
1375586 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1050611 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1050612 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 24 columns

SVD

%%time
utilMat = np.array(train1)
mask = np.isnan(utilMat)    # NaN cells: rows for users absent in 2015-01, plus NaNs already in the raw flags
mask_zero = (utilMat == 0)  # assumption: the dangling original line intended to flag zero cells; unused below
CPU times: user 1.37 s, sys: 11.2 s, total: 12.5 s
Wall time: 12.7 s
np.where(utilMat)   # positions of non-zero entries (NaN counts as truthy, so masked cells appear too)
(array([       0,        1,        2, ..., 13647308, 13647308, 13647308]),
 array([ 2,  2,  2, ..., 21, 22, 23]))
print(np.sum(mask))                          # number of NaN cells
print(utilMat.shape[0] * utilMat.shape[1])   # total number of cells
75917862
327535416
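So roughly 23% of the cells are NaN. Since mask is boolean, its mean gives the missing fraction directly:

print(mask.mean())  # ≈ 0.232 (75917862 / 327535416)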
%%time
masked_arr = np.ma.masked_array(utilMat, mask)   # hide the NaN cells
item_means = np.mean(masked_arr, axis=0)         # per-product mean ownership, ignoring masked cells
print(f'masked_arr.shape - {masked_arr.shape}')
print(f'item_means.shape - {item_means.shape}')
masked_arr.shape - (13647309, 24)
item_means.shape - (24,)
CPU times: user 2.36 s, sys: 9.37 s, total: 11.7 s
Wall time: 11.9 s
print(item_means)
[0.00014086538076379747 3.0805360857621244e-05 0.7842677689708568
 0.00048544670825167844 0.09478923983051138 0.01233997902565028
 0.013272341897923052 0.17154113021149453 0.058354032374049944
 0.0022137895858424282 0.002612161079038354 0.057973781681358506
 0.0969037465041161 0.02239120557495011 0.00805860610224633
 0.012010752692274092 0.0034949301815093824 0.060178987730119864
 0.05746153526288704 0.0327036577694172 0.005214594149384828
 0.0630621993029027 0.07005819202959135 0.1539579452461892]
masked_arr
masked_array(
  data=[[0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        ...,
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
        [--, --, --, ..., --, --, --]],
  mask=[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=1e+20)
%%time
utilMat2 = masked_arr.filled(item_means)   # fill the masked cells with the per-product means
print(f'utilMat2.shape - {utilMat2.shape}')
utilMat2.shape - (13647309, 24)
CPU times: user 1.73 s, sys: 7.63 s, total: 9.36 s
Wall time: 9.89 s
utilMat2
array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.40865381e-04, 3.08053609e-05, 7.84267769e-01, ...,
        6.30621993e-02, 7.00581920e-02, 1.53957945e-01]])
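Note that the last row of utilMat2 is exactly the item means: that user contributed no row in 2015-01, so every cell in the row was filled. A quick check:

print(np.allclose(utilMat2[-1], item_means))  # True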
%%time
x = np.tile(item_means, (utilMat2.shape[0], 1))   # item means broadcast to the full matrix shape, presumably for mean-centring before the SVD
print(f'x.shape - {x.shape}')
x.shape - (13647309, 24)
CPU times: user 452 ms, sys: 5.99 s, total: 6.44 s
Wall time: 7.17 s
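With the filled matrix utilMat2 and the tiled means x in place, the remaining step is the factorization itself. The scipy.linalg.sqrtm import at the top hints at the usual recipe: centre by the item means, take a thin SVD, keep the top k factors, and split the singular values between the two sides. A minimal sketch, assuming a rank-k truncation (k = 8 is an arbitrary choice, and the variable names are mine, not from the original code):

k = 8
utilMat_centered = utilMat2 - x                  # remove the per-product means

# thin SVD: for a (13647309, 24) matrix, U comes out (13647309, 24) -- memory-heavy but feasible at this width
U, s, V = np.linalg.svd(utilMat_centered, full_matrices=False)
U, s, V = U[:, :k], s[:k], V[:k, :]              # keep the top-k factors

s_root = sqrtm(np.diag(s))                       # matrix square root of the singular values, i.e. diag(sqrt(s))
UsV = U @ s_root @ V + x                         # reconstructed scores with the item means added back
print(UsV.shape)                                 # (13647309, 24)

Recommendations for a user are then the products with the highest reconstructed scores among those the user does not already hold.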
# sanity check: np.isnan on an integer array is all False (ints cannot hold NaN)
a = np.array([[1,0],[2,3]])
a
array([[1, 0],
       [2, 3]])
mask = np.isnan(a)
mask
array([[False, False],
       [False, False]])