Sklearn - Adult Income Classification

Basics
Published

January 10, 2022

Repeating content from http://appliedprogramming.net/machine-learning/adult-income-classification.html

import os
import urllib
from urllib.request import urlretrieve
from pathlib import Path

from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
# download the data
base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult'
filename = 'adult.data'
urlretrieve(f'{base_url}/{filename}', f'{filename}')
('adult.data', <http.client.HTTPMessage at 0x7fe9c7480e10>)
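
Since Path is already imported, we could also skip the download when a local copy exists; a minimal sketch reusing the base_url and filename defined above:

# only download if we don't already have a local copy
data_path = Path(filename)
if not data_path.exists():
    urlretrieve(f'{base_url}/{filename}', filename)
print(f'{filename} cached locally - {data_path.exists()}')
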
cols = ["Age", "Work-Class", "fnlwgt", "Education", "Education-Num", "Marital-Status", "Occupation", 
        "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-Country", "Earnings-Raw"]
df = pd.read_csv(filename, header=None, names=cols)
print(f'df.shape - {df.shape}')
df.head(3)
df.shape - (32561, 15)
Age Work-Class fnlwgt Education Education-Num Marital-Status Occupation Relationship Race Sex Capital-gain Capital-loss Hours-per-week Native-Country Earnings-Raw
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
df.replace([' <=50K', ' >50K'], [0, 1], inplace=True)
df.dropna(how='all', inplace=True)
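
As a quick sanity check (a minimal sketch, output not shown here), we can confirm that the target column is now numeric and inspect the class balance:

# Earnings-Raw should now contain only 0 and 1
print(df['Earnings-Raw'].dtype)
print(df['Earnings-Raw'].value_counts(normalize=True))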

Selecting K Best features

x_cols = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
x_train_np = df[x_cols].values
y_train_np = df['Earnings-Raw'].values

print(f'x_train_np.shape - {x_train_np.shape}. y_train_np.shape - {y_train_np.shape}')
x_train_np.shape - (32561, 5). y_train_np.shape - (32561,)

Now we create our transformer using the chi2 scoring function and a SelectKBest transformer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

transformer = SelectKBest(score_func=chi2, k=3)

This will create a new dataset with a reduced number of features

x_chi2_train_np = transformer.fit_transform(x_train_np, y_train_np)
print(f'x_chi2_train_np.shape - {x_chi2_train_np.shape}')
print(f'chi square scores - {transformer.scores_}')
print(f'selected columns are {np.argsort(transformer.scores_)[-3:]}')
x_chi2_train_np.shape - (32561, 3)
chi square scores - [0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]
selected columns are [4 0 1]
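
To map the selected column indices back to names, we can use the selector's get_support() mask; a small sketch over x_cols:

# get_support() returns a boolean mask over the input features
selected_mask = transformer.get_support()
print([col for col, keep in zip(x_cols, selected_mask) if keep])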

We could also use other correlation measures such as Pearson's correlation coefficient. As SciPy's pearsonr function accepts only one-dimensional arrays, we will create a wrapper around it that scores each column separately.

from scipy.stats import pearsonr

def multivariate_pearsonr(x, y):
    # compute the Pearson correlation of each column of x with y
    scores, pvalues = [], []
    for column in range(x.shape[1]):
        cur_score, cur_p = pearsonr(x[:, column], y)
        # take the absolute value so strong negative correlations also rank highly
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return np.array(scores), np.array(pvalues)

The Pearson coefficient ranges from -1 to 1, which is why the wrapper takes the absolute value of each score before ranking.

transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
x_pearson_train_np = transformer.fit_transform(x_train_np, y_train_np)
print(transformer.scores_)
print(f'selected columns are {np.argsort(transformer.scores_)[-3:]}')
[0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]
selected columns are [4 0 1]

Let us now fit a decision tree classifier on each reduced feature set and compare the cross-validated accuracy

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
scores_chi2 = cross_val_score(clf, x_chi2_train_np, y_train_np, scoring='accuracy')
scores_pearson = cross_val_score(clf, x_pearson_train_np, y_train_np, scoring='accuracy')

print("Chi2 performance: {0:.3f}".format(scores_chi2.mean()))
print("Pearson performance: {0:.3f}".format(scores_pearson.mean()))
Chi2 performance: 0.773
Pearson performance: 0.773

Unlike in the original blog post, we got the same selected columns from SelectKBest for both scoring functions, hence the performance is the same for both.
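
Since Pipeline is already imported, we could also chain the SelectKBest step and the tree so that feature selection is refit inside each cross-validation fold; a minimal sketch of that variant:

# chain SelectKBest and the tree so selection happens inside each CV fold
pipe = Pipeline([
    ('select', SelectKBest(score_func=chi2, k=3)),
    ('tree', DecisionTreeClassifier(random_state=14)),
])
pipe_scores = cross_val_score(pipe, x_train_np, y_train_np, scoring='accuracy')
print("Pipeline performance: {0:.3f}".format(pipe_scores.mean()))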

Principal Component Analysis

from sklearn.decomposition import PCA

pca = PCA(n_components=5)
xd = pca.fit_transform(x_train_np)
print(f'xd.shape - {xd.shape}')

np.set_printoptions(precision=3, suppress=True)
pca.explained_variance_ratio_
xd.shape - (32561, 5)
array([0.997, 0.003, 0.   , 0.   , 0.   ])

This shows us that the first principal component accounts for 99.7 percent of the variance in the dataset, largely because the features are on very different scales and have not been standardized
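
A common way to choose n_components is to look at the cumulative explained variance; a quick check on the ratios computed above:

# cumulative variance explained by the first k principal components
print(np.cumsum(pca.explained_variance_ratio_))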

# fit with full dataset
clf = DecisionTreeClassifier(random_state=14)
original_scores = cross_val_score(clf, x_train_np, y_train_np, scoring='accuracy')

print("The average score from the original dataset is {:.4f}".format(np.mean(original_scores)))

The average score from the original dataset is 0.8131

x_train_np.shape
(32561, 5)
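
To complete the comparison, we can also fit with the reduced features by cross-validating the same kind of tree on the PCA-transformed data xd; a minimal sketch (the resulting score is not reproduced here):

# fit with reduced features: reuse the PCA output from above
clf_pca = DecisionTreeClassifier(random_state=14)
pca_scores = cross_val_score(clf_pca, xd, y_train_np, scoring='accuracy')
print("The average score from the PCA-reduced dataset is {:.4f}".format(np.mean(pca_scores)))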

Let's convert the categorical features into numeric values. First, we identify which columns are categorical (non-integer dtype):

categorical_features = []
for col in df.columns:
    if df[col].dtype != np.int64:
        categorical_features.append(col)
        
categorical_features
['Work-Class',
 'Education',
 'Marital-Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native-Country']
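
One straightforward way to turn these into numeric values is one-hot encoding; a minimal sketch using pd.get_dummies (df_encoded is just an illustrative name, and the exact encoded shape is not shown here):

# one-hot encode every categorical column; the target is already numeric
df_encoded = pd.get_dummies(df, columns=categorical_features)
print(f'df_encoded.shape - {df_encoded.shape}')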