```python
import os
import urllib
from urllib.request import urlretrieve
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report
```
This notebook reproduces the content from http://appliedprogramming.net/machine-learning/adult-income-classification.html.
```python
# download the data
base_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult'
filename = 'adult.data'
urlretrieve(f'{base_url}/{filename}', filename)
```
('adult.data', <http.client.HTTPMessage at 0x7fe9c7480e10>)
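Since Path is already imported above, a small guard (our addition, not in the original post) avoids re-downloading the file on repeated runs:

```python
# only download when the file is not already present locally
if not Path(filename).exists():
    urlretrieve(f'{base_url}/{filename}', filename)
```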
= ["Age", "Work-Class", "fnlwgt", "Education", "Education-Num", "Marital-Status", "Occupation",
cols "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-Country", "Earnings-Raw"]
= pd.read_csv(filename, header=None, names=cols)
df print(f'df.shape - {df.shape}')
3) df.head(
df.shape - (32561, 15)
|   | Age | Work-Class | fnlwgt | Education | Education-Num | Marital-Status | Occupation | Relationship | Race | Sex | Capital-gain | Capital-loss | Hours-per-week | Native-Country | Earnings-Raw |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
```python
# encode the target: ' <=50K' -> 0, ' >50K' -> 1 (note the leading spaces in the raw data)
df.replace([' <=50K', ' >50K'], [0, 1], inplace=True)
# drop rows that are entirely empty
df.dropna(how='all', inplace=True)
```
Selecting K Best Features
= ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
x_cols = df[x_cols].values
x_train_np = df['Earnings-Raw'].values
y_train_np
print(f'x_train_np.shape - {x_train_np.shape}. y_train_np.shape - {y_train_np.shape}')
x_train_np.shape - (32561, 5). y_train_np.shape - (32561,)
Now we create our transformer using the chi2 scoring function with SelectKBest.
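Note that chi2 requires non-negative feature values, which the five columns chosen above satisfy. A quick check we can add (our addition, not in the original post):

```python
# chi2 only accepts non-negative feature values
assert (x_train_np >= 0).all()
```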
```python
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

transformer = SelectKBest(score_func=chi2, k=3)
```
This will create a new dataset with a reduced number of features.
```python
x_chi2_train_np = transformer.fit_transform(x_train_np, y_train_np)

print(f'x_chi2_train_np.shape - {x_chi2_train_np.shape}')
print(f'chi square scores - {transformer.scores_}')
print(f'selected columns are {np.argsort(transformer.scores_)[-3:]}')
```
x_chi2_train_np.shape - (32561, 3)
chi square scores - [0.2340371 0.33515395 0.22332882 0.15052631 0.22968907]
selected columns are [4 0 1]
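To map those column indices back to names we can use SelectKBest's get_support method. A minimal sketch, assuming transformer and x_cols from above; given the scores shown, this should print Age, Education-Num, and Hours-per-week:

```python
# indices of the columns the transformer kept, mapped back to names
selected = transformer.get_support(indices=True)
print([x_cols[i] for i in selected])
```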
We could also use other correlation measures, such as Pearson's correlation coefficient. Since SciPy's pearsonr function accepts only a one-dimensional array, we will create a wrapper around it.
```python
from scipy.stats import pearsonr

def multivariate_pearsonr(x, y):
    # score each column separately against the target
    scores, pvalues = [], []
    for column in range(x.shape[1]):
        cur_score, cur_p = pearsonr(x[:, column], y)
        # keep the absolute value so strong negative correlations rank highly too
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))
```
The Pearson coefficient lies between -1 and 1; the wrapper takes the absolute value of each score so that strongly negatively correlated features are ranked highly as well.
```python
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
x_pearson_train_np = transformer.fit_transform(x_train_np, y_train_np)

print(transformer.scores_)
print(f'selected columns are {np.argsort(transformer.scores_)[-3:]}')
```
[0.2340371 0.33515395 0.22332882 0.15052631 0.22968907]
selected columns are [4 0 1]
Let us now fit a decision tree classifier on each of the two reduced datasets and compare.
```python
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
scores_chi2 = cross_val_score(clf, x_chi2_train_np, y_train_np, scoring='accuracy')
scores_pearson = cross_val_score(clf, x_pearson_train_np, y_train_np, scoring='accuracy')

print("Chi2 performance: {0:.3f}".format(scores_chi2.mean()))
print("Pearson performance: {0:.3f}".format(scores_pearson.mean()))
```
Chi2 performance: 0.773
Pearson performance: 0.773
Unlike in the original blog post, SelectKBest chose the same columns for both scoring functions here, so the performance is the same for both.
Principal Component Analysis
```python
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
xd = pca.fit_transform(x_train_np)

print(f'xd.shape - {xd.shape}')
np.set_printoptions(precision=3, suppress=True)
pca.explained_variance_ratio_
```
xd.shape - (32561, 5)
array([0.997, 0.003, 0.   , 0.   , 0.   ])
This shows us that the first principal component accounts for 99.7 percent of the variance in the dataset.
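Such a lopsided ratio is largely a scale effect: Capital-gain spans a much wider numeric range than the other columns, so it dominates the unscaled covariance matrix. A minimal sketch of standardizing first (our addition, not in the original post):

```python
from sklearn.preprocessing import StandardScaler

# standardize each column to zero mean and unit variance so no
# single wide-range column (e.g. Capital-gain) dominates the PCA
x_scaled = StandardScaler().fit_transform(x_train_np)
pca_scaled = PCA(n_components=5).fit(x_scaled)
print(pca_scaled.explained_variance_ratio_)
```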
```python
# fit with the full dataset
clf = DecisionTreeClassifier(random_state=14)
original_scores = cross_val_score(clf, x_train_np, y_train_np, scoring='accuracy')

print("The average score from the original dataset is {:.4f}".format(np.mean(original_scores)))
```
The average score from the original dataset is 0.8131
```python
# fit with reduced features
x_train_np.shape
```
(32561, 5)
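The comment above announces a fit on reduced features; a minimal sketch of how that could look using the PCA output xd from above (keeping two components is our assumption, not from the original post):

```python
# cross-validate the same tree on the first two principal components
clf_pca = DecisionTreeClassifier(random_state=14)
pca_scores = cross_val_score(clf_pca, xd[:, :2], y_train_np, scoring='accuracy')
print("The average score from the PCA-reduced dataset is {:.4f}".format(np.mean(pca_scores)))
```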
Let's convert the categorical features into numeric values. First, we identify the columns that are not already numeric.
```python
# collect every column whose dtype is not integer; after the target
# encoding above, these are the text-valued categorical columns
categorical_features = []
for col in df.columns:
    if df[col].dtype != np.int64:
        categorical_features.append(col)

categorical_features
```
['Work-Class',
'Education',
'Marital-Status',
'Occupation',
'Relationship',
'Race',
'Sex',
'Native-Country']
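One possible way to carry out the conversion is one-hot encoding with pd.get_dummies; this is a sketch of our own rather than necessarily the approach of the original post:

```python
# one-hot encode the categorical columns; the result is an
# all-numeric frame ready for scikit-learn estimators
df_numeric = pd.get_dummies(df, columns=categorical_features)
print(f'df_numeric.shape - {df_numeric.shape}')
```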