Given various features, the aim is to build a predictive model to determine the income level of people in the US. The income levels are binned into two groups: below 50K and above 50K.
After analysing the dataset with scatter plots and bar plots, I settled on the following hypotheses and on the variables that my model will use.
$H_{0}$ : None of the variables (below) contributes significantly to the prediction of the model.
$H_{1}$ : At least one of the variables (below) has a significant impact on the dependent variable.
Variables
This data set contains weighted census data extracted from the 1994 and 1995 current population surveys conducted by the U.S. Census Bureau. The data contains demographic and employment related variables.
Original Owner: U.S. Census Bureau, United States Department of Commerce
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import statsmodels.api as sm
import numpy as np
import scipy
from sklearn import cluster, preprocessing, linear_model, model_selection, metrics
from sklearn.linear_model import Lasso
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
# Notebook display settings: cap how much of a DataFrame is shown
# (7 rows, 7 columns) and use the HTML table representation.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 7)
pd.set_option('display.notebook_repr_html', True)
# Interactivity
from IPython.display import display
from ipywidgets import interactive, Select, Output, widgets
# Plotting (Plot.ly offline mode)
import plotly
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly for notebook use (connected=False selects offline mode,
# matching the "Plot.ly offline mode" note on the imports above).
init_notebook_mode(connected=False)
# IPython magic: draw matplotlib figures inside the notebook output cells.
%matplotlib inline
# Use the 'ggplot' style sheet for every matplotlib figure that follows.
plt.style.use('ggplot')
# Read the training set from Dataset/train.csv.
train_path = os.path.join('Dataset', 'train.csv')
df_train = pd.read_csv(train_path)

# Keep only the columns whose non-null count exceeds 80% of the row count;
# every other column is discarded.
non_null_counts = df_train.notnull().sum()
min_non_null = len(df_train) * .8
df_train = df_train.loc[:, non_null_counts > min_non_null]

# Recode the target: -50000 becomes 0 and 50000 becomes 1.
income_codes = {-50000: 0, 50000: 1}
df_train["income_level"] = df_train["income_level"].map(income_codes)
df_train
df_train.info()
# Numeric predictors: every numeric column of df_train except the target.
dft_num = df_train._get_numeric_data().drop(['income_level'], axis=1)

# Remaining columns. This frame still contains income_level, because the
# target was removed from dft_num above and is therefore not dropped here.
numeric_names = dft_num.columns.tolist()
dft_str = df_train.drop(numeric_names, axis=1)

# Share of observations that fall in each income level.
income_counts = pd.crosstab(index=df_train["income_level"], columns="Frequency")
income_counts / df_train["income_level"].count()

# One colour per observation: red for income level 0, green for level 1.
level_colors = {0: 'red', 1: 'green'}
color = df_train["income_level"].map(level_colors)
########
# Define plotting function
def iplot_scatter(X_axis, Y_axis):
    """Scatter-plot two columns of ``df_train`` against each other.

    X_axis and Y_axis are column names; each point is coloured through the
    module-level ``color`` series.
    """
    scatter_options = dict(
        kind='scatter',
        x=X_axis,
        y=Y_axis,
        c=color,
        figsize=(16, 8),
    )
    df_train.plot(**scatter_options)
# Two list selectors, one per plot axis. Both offer the numeric column names
# and are given the same size.
selector_size = {'height': '142px', 'width': '250px'}
X_Widget = Select(
    options=dft_num.columns.tolist(),
    description='X_axis',
    **selector_size
)
Y_Widget = Select(
    options=dft_num.columns.tolist(),
    description='Y_axis',
    **selector_size
)
def update_iplot_scatter(**kwargs):
    """Redraw the scatter plot from the current selector values.

    The keyword arguments passed in by ``interactive`` are accepted but not
    read; the column names come straight from the two widgets.
    """
    iplot_scatter(X_Widget.value, Y_Widget.value)


# Bind the two selectors to the redraw callback.
interactive(update_iplot_scatter, X_axis=X_Widget, Y_axis=Y_Widget)
# Define plotting function
def plot_bar(columna):
    """Bar-plot the row counts of ``dft_str`` per value of *columna*.

    Counts are split by income_level, giving one bar per income level
    within each category.
    """
    counts_by_level = dft_str.groupby([columna, 'income_level']).size().unstack()
    counts_by_level.plot(kind='bar', figsize=(16, 8))
# Selector listing every column of dft_str except the target.
categorical_columns = dft_str.drop(['income_level'], axis=1).columns.tolist()
X_datawid = Select(
    options=categorical_columns,
    description='Data',
    height='100px',
    width='500px'
)
def update_plot_bar(**kwargs):
    """Redraw the bar chart for the column currently selected in X_datawid.

    The keyword arguments passed in by ``interactive`` are accepted but not
    read; the column name comes straight from the widget.
    """
    plot_bar(X_datawid.value)


# Bind the selector to the redraw callback.
interactive(update_plot_bar, X_data=X_datawid)
# Numeric predictors carried into the model. The explicit copy makes X an
# independent frame, so adding a column below does not write into a slice of
# dft_num (which is what triggers pandas' SettingWithCopyWarning).
X = dft_num[['age', 'occupation_code', 'wage_per_hour', 'capital_gains', 'dividend_from_Stocks']].copy()

# Bucket age into five ordinal bands:
#   age <= 20 -> 1, (20, 40] -> 2, (40, 60] -> 3, (60, 80] -> 4, above 80 -> 5.
# With right=True each upper bound belongs to the lower band, so the
# boundaries match a chain of `<=` comparisons; the +1 makes bands start at 1.
age_upper_bounds = [20, 40, 60, 80]
age_category = np.digitize(X['age'].values, age_upper_bounds, right=True) + 1
X['age_category'] = age_category
X.info()

# Mirror the band onto df_train so it can be grouped against the target.
df_train['age_category'] = X.age_category

# Row counts per age band, split by income level.
df_bar = df_train.groupby(['age_category', 'income_level']).size()
df_bar = df_bar.unstack()
df_bar.plot(kind='bar', figsize=(16, 8))
# One indicator column per class_of_worker value; 'Not in universe' is
# dropped, so it is the category represented by all-zero indicators.
dummies = pd.get_dummies(df_train['class_of_worker']).drop(['Not in universe'], axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()
# Distinct education values, for inspection.
row = pd.Series(df_train.education.unique())
row

# Collapse these education values into one "Elementary-High School" group.
collapsed_levels = [
    "10th grade",
    "Children",
    "Less than 1st grade",
    "7th and 8th grade",
    "12th grade no diploma",
    "5th or 6th grade",
    "11th grade",
    "9th grade",
    "1st 2nd 3rd or 4th grade",
]
education_groups = dict.fromkeys(collapsed_levels, "Elementary-High School")
df_train['education'] = df_train['education'].replace(education_groups)

# Row counts per education group, split by income level.
df_bar = df_train.groupby(['education', 'income_level']).size().unstack()
df_bar.plot(kind='bar', figsize=(16, 8))

# Indicator columns for education; the collapsed group is the dropped one.
dummies = pd.get_dummies(df_train['education']).drop(['Elementary-High School'], axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()
# Distinct marital_status values, for inspection.
pd.Series(df_train.marital_status.unique())

# Merge the three "Married-*" values into "Married", and the widowed,
# divorced and separated values into a single group.
marital_groups = dict.fromkeys(
    [
        "Married-civilian spouse present",
        "Married-spouse absent",
        "Married-A F spouse present",
    ],
    "Married",
)
marital_groups.update(dict.fromkeys(
    ["Widowed", "Divorced", "Separated"],
    "Widowed/Divorced/Separated",
))
df_train['marital_status'] = df_train['marital_status'].replace(marital_groups)

# Row counts per marital status group, split by income level.
df_bar = df_train.groupby(['marital_status', 'income_level']).size().unstack()
df_bar.plot(kind='bar', figsize=(16, 8))

# Indicator columns; 'Widowed/Divorced/Separated' is the dropped category.
dummies = pd.get_dummies(df_train['marital_status']).drop('Widowed/Divorced/Separated', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()
# Indicator columns for race; 'Other' is the dropped category.
dummies = pd.get_dummies(df_train['race']).drop('Other', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

# Binary flag for sex: 'Female' -> 0, 'Male' -> 1.
sex_codes = {'Female': 0, 'Male': 1}
X['sex'] = df_train['sex'].map(sex_codes)
X.info()
# Distinct d_household_summary values, for inspection.
pd.Series(df_train.d_household_summary.unique())

# Fold the three "Child ..." values into "Other relative of householder".
child_values = [
    "Child 18 or older",
    "Child under 18 never married",
    "Child under 18 ever married",
]
household_groups = dict.fromkeys(child_values, "Other relative of householder")
df_train['d_household_summary'] = df_train['d_household_summary'].replace(household_groups)

# Row counts per household group, split by income level.
df_bar = df_train.groupby(['d_household_summary', 'income_level']).size().unstack()
df_bar.plot(kind='bar', figsize=(16, 8))

# Indicator columns; 'Group Quarters- Secondary individual' is dropped.
dummies = pd.get_dummies(df_train['d_household_summary']).drop(
    'Group Quarters- Secondary individual', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()
# Distinct citizenship values, for inspection.
pd.Series(df_train.citizenship.unique())

# Reduce citizenship to two labels: one raw value maps to "Born in the
# United States", the four listed below to "Not born in the United States".
citizenship_groups = {"Native- Born in the United States": "Born in the United States"}
citizenship_groups.update(dict.fromkeys(
    [
        "Foreign born- Not a citizen of U S",
        "Foreign born- U S citizen by naturalization",
        "Native- Born abroad of American Parent(s)",
        "Native- Born in Puerto Rico or U S Outlying",
    ],
    "Not born in the United States",
))
df_train['citizenship'] = df_train['citizenship'].replace(citizenship_groups)

# Row counts per citizenship label, split by income level.
df_bar = df_train.groupby(['citizenship', 'income_level']).size().unstack()
df_bar.plot(kind='bar', figsize=(16, 8))

# Binary flag: 1 for "Born in the United States", 0 for the other label.
citizenship_codes = {'Not born in the United States': 0, 'Born in the United States': 1}
X['citizenship'] = df_train['citizenship'].map(citizenship_codes)
X.info()
# Indicator columns for tax_filer_status; 'Nonfiler' is the dropped category.
dummies = pd.get_dummies(df_train['tax_filer_status']).drop('Nonfiler', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

# Remove the raw age column (age_category remains) and append the target so
# that rows can be filtered by income level.
target_column = df_train[['income_level']]
X = pd.concat([X.drop('age', axis=1), target_column], axis=1)
X
# Split the rows by class.
X_1 = X[X['income_level'] == 1]
# All class-0 rows, kept under this name; not referenced again in this section.
X_0_oversample = X[X['income_level'] == 0]

# Draw a fraction 15000/199523 of the class-0 rows. The draw is made with
# replacement and without a fixed seed, so it differs between runs.
class0_fraction = 15000. / 199523
X_0 = X_0_oversample.sample(frac=class0_fraction, replace=True)

# Sampled class-0 rows followed by every class-1 row.
X_undersample = pd.concat([X_0, X_1])

# Class shares in the undersampled set.
undersample_counts = pd.crosstab(index=X_undersample["income_level"], columns="Frequency")
undersample_counts / X_undersample["income_level"].count()

# Separate the target from the predictors.
y_undersample = X_undersample.income_level
X_undersample = X_undersample.drop("income_level", axis=1)

# Fit a MinMax scaler on the undersampled predictors and keep the scaled
# values in a frame that carries the same column names.
scaler = preprocessing.MinMaxScaler().fit(X_undersample)
scaled_values = scaler.transform(X_undersample)
X1 = pd.DataFrame(scaled_values, columns=X_undersample.columns)
# Grid-search the regularisation strength C of an L1-penalised logistic
# regression with shuffled 5-fold cross-validation.
# The solver is named explicitly because the 'l1' penalty is only accepted by
# some solvers; 'liblinear' is one of them, so the fit does not depend on
# which solver the installed scikit-learn picks by default.
gs = model_selection.GridSearchCV(
    estimator=linear_model.LogisticRegression(solver='liblinear'),
    param_grid={'C': np.logspace(-10, 10, 21), 'penalty': ['l1']},
    cv=model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
)
gs.fit(X1, y_undersample)

# GridSearchCV refits the best parameter combination on the whole of
# (X1, y_undersample) by default, so best_estimator_ is already trained and
# a second fit on the same data is unnecessary.
model = gs.best_estimator_

# Call form of print: identical output under Python 2, valid under Python 3.
print(model.intercept_)

# Coefficient of every predictor, indexed by column name.
coef_lasso = pd.DataFrame({"Coef": model.coef_[0]}, index=X1.columns)
coef_lasso
# Separate the predictors of the full data set from the target.
X = X.drop("income_level", axis=1)

# The model was fitted on MinMax-scaled predictors (X1), so the full data set
# has to pass through the same fitted scaler before it is scored; feeding the
# raw values would put the features on a different scale from the training data.
X_scaled = scaler.transform(X)
y_hat = model.predict(X_scaled)

# Confusion table: rows are the observed income level, columns the prediction.
pd.crosstab(df_train.income_level,
            y_hat,
            rownames=['True Class'],
            colnames=['Hypothesized Class'])

# Mean accuracy over the full data set.
model.score(X_scaled, df_train.income_level)

# Predicted probability of class 1, used to trace the ROC curve.
p_hat = model.predict_proba(X_scaled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(df_train.income_level, p_hat)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % metrics.auc(fpr, tpr))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0, 1.01])
plt.ylim([0, 1.01])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("Income Level ROC for undersampling model")
plt.legend(loc='lower right')