Problem Statement and Hypothesis Generation¶

Given various features, the aim is to build a predictive model that determines the income level of people in the US. The income levels are binned into below 50K and above 50K.

By analysing the dataset with scatter plots and bar plots, I settled on the following hypotheses and on the variables that my model will use.

$H_{0}$ : None of the variables (below) contributes significantly to the prediction of the model.

$H_{1}$ : At least one of the variables (below) contributes significantly to the prediction of the dependent variable.

Variables

Age
Occupation Code
Wage per hour
Capital Gains
Dividends from stock
Class of workers
Education
Marital Status
Race
Sex
Household Summary
Citizenship
Tax Status

Info of the Data¶

This data set contains weighted census data extracted from the 1994 and 1995 current population surveys conducted by the U.S. Census Bureau. The data contains demographic and employment related variables.

Original owner: U.S. Census Bureau, United States Department of Commerce

Library¶

import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import statsmodels.api as sm
import numpy as np
import scipy
from sklearn import cluster, preprocessing, linear_model, model_selection, metrics
from sklearn.linear_model import Lasso
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

# Keep notebook output compact: show at most 7 rows and 7 columns of any
# frame, and render frames as HTML tables.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 7)
pd.set_option('display.notebook_repr_html', True)

# Interactivity
from IPython.display import display
from ipywidgets import interactive, Select, Output, widgets

# Plotting (Plot.ly offline mode)
import plotly
from plotly import graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Initialise Plotly for offline use inside the notebook.
init_notebook_mode(connected=False)

# Render matplotlib figures inline and use the 'ggplot' style for all plots.
%matplotlib inline
plt.style.use('ggplot')

Dataset Train¶

train_path = os.path.join('Dataset', 'train.csv')
df_train = pd.read_csv(train_path)

# Keep only the columns whose non-null count exceeds 80% of the rows
# (i.e. drop every column with 20% or more missing values).
non_null_counts = df_train.notnull().sum()
df_train = df_train.loc[:, non_null_counts > len(df_train)*.8]

# Recode the target: -50000 (below 50K) becomes 0 and 50000 (above 50K) becomes 1.
df_train["income_level"] = df_train["income_level"].map({-50000: 0, 50000: 1})

df_train

Summary Table¶

# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 37 columns):
age                                 199523 non-null int64
class_of_worker                     199523 non-null object
industry_code                       199523 non-null int64
occupation_code                     199523 non-null int64
education                           199523 non-null object
wage_per_hour                       199523 non-null int64
enrolled_in_edu_inst_lastwk         199523 non-null object
marital_status                      199523 non-null object
major_industry_code                 199523 non-null object
major_occupation_code               199523 non-null object
race                                199523 non-null object
hispanic_origin                     198649 non-null object
sex                                 199523 non-null object
member_of_labor_union               199523 non-null object
reason_for_unemployment             199523 non-null object
full_parttime_employment_stat       199523 non-null object
capital_gains                       199523 non-null int64
capital_losses                      199523 non-null int64
dividend_from_Stocks                199523 non-null int64
tax_filer_status                    199523 non-null object
region_of_previous_residence        199523 non-null object
state_of_previous_residence         198815 non-null object
d_household_family_stat             199523 non-null object
d_household_summary                 199523 non-null object
live_1_year_ago                     199523 non-null object
num_person_Worked_employer          199523 non-null int64
family_members_under_18             199523 non-null object
country_father                      192810 non-null object
country_mother                      193404 non-null object
country_self                        196130 non-null object
citizenship                         199523 non-null object
business_or_self_employed           199523 non-null int64
fill_questionnaire_veteran_admin    199523 non-null object
veterans_benefits                   199523 non-null int64
weeks_worked_in_year                199523 non-null int64
year                                199523 non-null int64
income_level                        199523 non-null int64
dtypes: int64(13), object(24)
memory usage: 56.3+ MB

# Numeric predictors only (the target, income_level, is removed below).
# select_dtypes is the public pandas API for this; _get_numeric_data is a
# private helper that may change without notice.
dft_num = df_train.select_dtypes(include=[np.number])
dft_num = dft_num.drop(['income_level'], axis=1)

# Everything that is not a numeric predictor: the non-numeric columns plus
# income_level, which is kept so the bar plots can group by it.
dft_str = df_train.drop(dft_num.columns.tolist(), axis=1)

# Share of rows that belong to each income level group (class balance).
pd.crosstab(index = df_train["income_level"], columns="Frequency")/df_train["income_level"].count()

Descriptive Analysis¶

Scatter Plot for numeric Data¶

# Point colour per row for the scatter plots: red for income level 0, green for 1.
color = df_train.income_level.map({0: 'red', 1: 'green'})

########
# Scatter plot helper
def iplot_scatter(X_axis, Y_axis):
    """Scatter two columns of df_train against each other, coloured by income level.

    X_axis, Y_axis: names of the columns drawn on the horizontal and
    vertical axis respectively.
    """
    scatter_options = dict(kind='scatter', x=X_axis, y=Y_axis, c=color, figsize=(16, 8))
    df_train.plot(**scatter_options)
    
# Both axis selectors offer the same choices: every numeric predictor column.
numeric_columns = dft_num.columns.tolist()

X_Widget = Select(
    description='X_axis',
    options=list(numeric_columns),
    width='250px',
    height='142px'
)

Y_Widget = Select(
    description='Y_axis',
    options=list(numeric_columns),
    width='250px',
    height='142px'
)

def update_iplot_scatter(**kwargs):
    """Redraw the scatter plot from the current selector values.

    The keyword arguments are ignored; the selection is read directly
    from X_Widget and Y_Widget.
    """
    iplot_scatter(X_Widget.value, Y_Widget.value)

# Wire both axis selectors to the scatter-plot redraw callback.
interactive(update_iplot_scatter, X_axis = X_Widget, Y_axis = Y_Widget)

Bar Plot for Non-numeric data¶

# Bar plot helper
def plot_bar(columna):
    """Draw grouped bars with the row count per value of ``columna``, split by income level.

    columna: name of a column of dft_str to group by.
    """
    counts = dft_str.groupby([columna, 'income_level']).size().unstack()
    counts.plot(figsize=(16, 8), kind='bar')
    
# Selector listing every column of dft_str except the target itself.
X_datawid = Select(
    description='Data',
    options=dft_str.drop(['income_level'], axis=1).columns.tolist(),
    width='500px',
    height='100px'
)

def update_plot_bar(**kwargs):
    """Redraw the bar plot for the column currently chosen in X_datawid.

    The keyword arguments are ignored; the selection is read directly
    from the widget.
    """
    plot_bar(X_datawid.value)

# Wire the column selector to the bar-plot redraw callback.
interactive(update_plot_bar, X_data = X_datawid)

Modifying the Dataset¶

# Numeric predictors used by the model. An explicit copy is taken so that the
# columns added to X later are written to an independent frame rather than to
# a slice of dft_num, which is what triggers pandas' SettingWithCopyWarning.
X = dft_num[['age','occupation_code','wage_per_hour','capital_gains','dividend_from_Stocks']].copy()

Age¶

# Bin age into five ordinal groups: 1 = up to 20, 2 = 21-40, 3 = 41-60,
# 4 = 61-80, 5 = over 80. pd.cut uses right-closed intervals by default,
# which matches a chain of `<=` comparisons against 20, 40, 60 and 80.
age_category = pd.cut(
    X.age,
    bins=[-np.inf, 20, 40, 60, 80, np.inf],
    labels=[1, 2, 3, 4, 5]
).astype('int64')

# assign() returns a new frame, so nothing is written into a slice of another
# frame and no SettingWithCopyWarning is raised.
X = X.assign(age_category=age_category)

C:\Users\Fidel\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

# Confirm that age_category was added to the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 6 columns):
age                     199523 non-null int64
occupation_code         199523 non-null int64
wage_per_hour           199523 non-null int64
capital_gains           199523 non-null int64
dividend_from_Stocks    199523 non-null int64
age_category            199523 non-null int64
dtypes: int64(6)
memory usage: 9.1 MB

# Copy the age bins onto df_train so they can be grouped together with the target.
df_train['age_category'] = X['age_category']

# Row counts per (age bin, income level), drawn as one bar group per age bin.
df_bar = df_train.groupby(['age_category', 'income_level']).size().unstack()
df_bar.plot(figsize=(16, 8), kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x150cdeb8>

Class of workers¶

# One-hot encode class of worker; the 'Not in universe' level is dropped.
dummies = pd.get_dummies(df_train['class_of_worker']).drop(['Not in universe'], axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 14 columns):
age                               199523 non-null int64
occupation_code                   199523 non-null int64
wage_per_hour                     199523 non-null int64
capital_gains                     199523 non-null int64
dividend_from_Stocks              199523 non-null int64
age_category                      199523 non-null int64
Federal government                199523 non-null uint8
Local government                  199523 non-null uint8
Never worked                      199523 non-null uint8
Private                           199523 non-null uint8
Self-employed-incorporated        199523 non-null uint8
Self-employed-not incorporated    199523 non-null uint8
State government                  199523 non-null uint8
Without pay                       199523 non-null uint8
dtypes: int64(6), uint8(8)
memory usage: 10.7 MB

Education¶

# Distinct education levels present in the data.
row = pd.Series(df_train['education'].unique())
row

0           High school graduate
1     Some college but no degree
2                     10th grade
                 ...            
14     Doctorate degree(PhD EdD)
15                     9th grade
16      1st 2nd 3rd or 4th grade
dtype: object

# Collapse every level below a high school diploma into a single category.
below_high_school = [
    "10th grade",
    "Children",
    "Less than 1st grade",
    "7th and 8th grade",
    "12th grade no diploma",
    "5th or 6th grade",
    "11th grade",
    "9th grade",
    "1st 2nd 3rd or 4th grade",
]
df_train['education'] = df_train['education'].replace(
    {level: "Elementary-High School" for level in below_high_school})

# Row counts per (education level, income level), one bar group per education level.
df_bar = df_train.groupby(['education', 'income_level']).size().unstack()
df_bar.plot(figsize=(16, 8), kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x150cd1d0>

# One-hot encode education; the merged 'Elementary-High School' level is dropped.
dummies = pd.get_dummies(df_train['education']).drop(['Elementary-High School'], axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 22 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
dtypes: int64(6), uint8(16)
memory usage: 12.2 MB

Marital Status¶

# Distinct marital status values present in the data.
pd.Series(df_train['marital_status'].unique())

0                            Widowed
1                           Divorced
2                      Never married
3    Married-civilian spouse present
4                          Separated
5              Married-spouse absent
6         Married-A F spouse present
dtype: object

# Reduce marital status to three groups: the three "Married-..." variants
# become "Married"; widowed, divorced and separated are merged; any other
# value is left as it is.
married = ["Married-civilian spouse present", "Married-spouse absent", "Married-A F spouse present"]
formerly_married = ["Widowed", "Divorced", "Separated"]

marital_groups = {status: "Married" for status in married}
marital_groups.update({status: "Widowed/Divorced/Separated" for status in formerly_married})

df_train['marital_status'] = df_train['marital_status'].replace(marital_groups)

# Row counts per (marital status, income level), one bar group per status.
df_bar = df_train.groupby(['marital_status', 'income_level']).size().unstack()
df_bar.plot(figsize=(16, 8), kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x121d7898>

# One-hot encode marital status; the 'Widowed/Divorced/Separated' level is dropped.
dummies = pd.get_dummies(df_train['marital_status']).drop('Widowed/Divorced/Separated', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 24 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
dtypes: int64(6), uint8(18)
memory usage: 12.6 MB

Race¶

# One-hot encode race; the 'Other' level is dropped.
dummies = pd.get_dummies(df_train['race']).drop('Other', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 28 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
Amer Indian Aleut or Eskimo               199523 non-null uint8
Asian or Pacific Islander                 199523 non-null uint8
Black                                     199523 non-null uint8
White                                     199523 non-null uint8
dtypes: int64(6), uint8(22)
memory usage: 13.3 MB

Sex¶

# Encode sex as a single binary column: Female -> 0, Male -> 1.
sex_codes = {'Female': 0, 'Male': 1}
X['sex'] = df_train['sex'].map(sex_codes)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 29 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
Amer Indian Aleut or Eskimo               199523 non-null uint8
Asian or Pacific Islander                 199523 non-null uint8
Black                                     199523 non-null uint8
White                                     199523 non-null uint8
sex                                       199523 non-null int64
dtypes: int64(7), uint8(22)
memory usage: 14.8 MB

Household Summary¶

# Distinct household summary values present in the data.
pd.Series(df_train['d_household_summary'].unique())

0           Other relative of householder
1                             Householder
2                       Child 18 or older
                     ...                 
5              Nonrelative of householder
6    Group Quarters- Secondary individual
7             Child under 18 ever married
dtype: object

# Fold the three "Child ..." levels into "Other relative of householder".
child_levels = [
    "Child 18 or older",
    "Child under 18 never married",
    "Child under 18 ever married",
]
df_train['d_household_summary'] = df_train['d_household_summary'].replace(
    {level: "Other relative of householder" for level in child_levels})

# Row counts per (household summary, income level), one bar group per household role.
df_bar = df_train.groupby(['d_household_summary', 'income_level']).size().unstack()
df_bar.plot(figsize=(16, 8), kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x15051208>

# One-hot encode the household summary; 'Group Quarters- Secondary individual' is dropped.
dummies = pd.get_dummies(df_train['d_household_summary']).drop('Group Quarters- Secondary individual', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 33 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
Amer Indian Aleut or Eskimo               199523 non-null uint8
Asian or Pacific Islander                 199523 non-null uint8
Black                                     199523 non-null uint8
White                                     199523 non-null uint8
sex                                       199523 non-null int64
Householder                               199523 non-null uint8
Nonrelative of householder                199523 non-null uint8
Other relative of householder             199523 non-null uint8
Spouse of householder                     199523 non-null uint8
dtypes: int64(7), uint8(26)
memory usage: 15.6 MB

Citizenship¶

# Distinct citizenship values present in the data.
pd.Series(df_train['citizenship'].unique())

0              Native- Born in the United States
1             Foreign born- Not a citizen of U S
2    Foreign born- U S citizen by naturalization
3      Native- Born abroad of American Parent(s)
4    Native- Born in Puerto Rico or U S Outlying
dtype: object

# Reduce citizenship to two groups: born in the United States versus every
# other listed origin.
not_us_born = [
    "Foreign born- Not a citizen of U S",
    "Foreign born- U S citizen by naturalization",
    "Native- Born abroad of American Parent(s)",
    "Native- Born in Puerto Rico or U S Outlying",
]
citizenship_groups = {"Native- Born in the United States": "Born in the United States"}
citizenship_groups.update({origin: "Not born in the United States" for origin in not_us_born})

df_train['citizenship'] = df_train['citizenship'].replace(citizenship_groups)

# Row counts per (citizenship group, income level), one bar group per citizenship group.
df_bar = df_train.groupby(['citizenship', 'income_level']).size().unstack()
df_bar.plot(figsize=(16, 8), kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0xe4e1550>

# Encode the two citizenship groups as a binary column (1 = born in the United States).
citizenship_codes = {'Not born in the United States': 0, 'Born in the United States': 1}
X['citizenship'] = df_train['citizenship'].map(citizenship_codes)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 34 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
Amer Indian Aleut or Eskimo               199523 non-null uint8
Asian or Pacific Islander                 199523 non-null uint8
Black                                     199523 non-null uint8
White                                     199523 non-null uint8
sex                                       199523 non-null int64
Householder                               199523 non-null uint8
Nonrelative of householder                199523 non-null uint8
Other relative of householder             199523 non-null uint8
Spouse of householder                     199523 non-null uint8
citizenship                               199523 non-null int64
dtypes: int64(8), uint8(26)
memory usage: 17.1 MB

Tax Filer Status¶

# One-hot encode the tax filer status; the 'Nonfiler' level is dropped.
dummies = pd.get_dummies(df_train['tax_filer_status']).drop('Nonfiler', axis=1)
X = pd.concat([X, dummies], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 39 columns):
age                                       199523 non-null int64
occupation_code                           199523 non-null int64
wage_per_hour                             199523 non-null int64
capital_gains                             199523 non-null int64
dividend_from_Stocks                      199523 non-null int64
age_category                              199523 non-null int64
Federal government                        199523 non-null uint8
Local government                          199523 non-null uint8
Never worked                              199523 non-null uint8
Private                                   199523 non-null uint8
Self-employed-incorporated                199523 non-null uint8
Self-employed-not incorporated            199523 non-null uint8
State government                          199523 non-null uint8
Without pay                               199523 non-null uint8
Associates degree-academic program        199523 non-null uint8
Associates degree-occup /vocational       199523 non-null uint8
Bachelors degree(BA AB BS)                199523 non-null uint8
Doctorate degree(PhD EdD)                 199523 non-null uint8
High school graduate                      199523 non-null uint8
Masters degree(MA MS MEng MEd MSW MBA)    199523 non-null uint8
Prof school degree (MD DDS DVM LLB JD)    199523 non-null uint8
Some college but no degree                199523 non-null uint8
Married                                   199523 non-null uint8
Never married                             199523 non-null uint8
Amer Indian Aleut or Eskimo               199523 non-null uint8
Asian or Pacific Islander                 199523 non-null uint8
Black                                     199523 non-null uint8
White                                     199523 non-null uint8
sex                                       199523 non-null int64
Householder                               199523 non-null uint8
Nonrelative of householder                199523 non-null uint8
Other relative of householder             199523 non-null uint8
Spouse of householder                     199523 non-null uint8
citizenship                               199523 non-null int64
Head of household                         199523 non-null uint8
Joint both 65+                            199523 non-null uint8
Joint both under 65                       199523 non-null uint8
Joint one under 65 & one 65+              199523 non-null uint8
Single                                    199523 non-null uint8
dtypes: int64(8), uint8(31)
memory usage: 18.1 MB

# Raw age is replaced by age_category, so drop it; then append the target
# column so that rows can be filtered by class.
predictors = X.drop('age', axis=1)
X = pd.concat([predictors, df_train[['income_level']]], axis=1)

X

Undersampling¶

# Keep every row of income level 1 (the minority class).
X_1 = X[X['income_level'] == 1]

# Undersample income level 0 (the majority class). The fraction is expressed
# relative to the size of X instead of a hard-coded row count, rows are drawn
# without replacement so no observation is duplicated, and the seed is fixed
# so the sample (and everything fitted on it) is reproducible.
X_0 = X[X['income_level'] == 0].sample(frac=15000./len(X), replace=False, random_state=0)
X_0_oversample = X[X['income_level'] == 0]

X_undersample = pd.concat([X_0, X_1])

# Class balance of the undersampled frame, as a share of its rows.
undersample_target = X_undersample["income_level"]
pd.crosstab(index = undersample_target, columns="Frequency")/undersample_target.count()

# Separate the target from the predictors.
y_undersample = X_undersample["income_level"]

X_undersample = X_undersample.drop(["income_level"], axis = 1)

Lasso Regularization¶

# Fit a min-max scaler on the undersampled predictors and keep it around so
# the same transformation can be applied to other data later.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_undersample)

# Scaled predictors, with the original column names restored.
scaled_values = scaler.transform(X_undersample)
X1 = pd.DataFrame(scaled_values, columns = X_undersample.columns)

# Grid search over the regularisation strength C for an L1-penalised (lasso)
# logistic regression, scored with 5-fold shuffled cross-validation.
# The solver is pinned to liblinear, which supports the l1 penalty; relying on
# the library default breaks on scikit-learn versions whose default solver
# does not accept l1.
gs = model_selection.GridSearchCV(
    estimator = linear_model.LogisticRegression(solver = 'liblinear'),
    param_grid = {'C': np.logspace(-10, 10, 21), 'penalty': ['l1']},
    cv = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 0)
)

gs.fit(X1, y_undersample)

GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1'], 'C': array([  1.00000e-10,   1.00000e-09,   1.00000e-08,   1.00000e-07,
         1.00000e-06,   1.00000e-05,   1.00000e-04,   1.00000e-03,
         1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03,   1.00000e+04,   1.00000e+05,
         1.00000e+06,   1.00000e+07,   1.00000e+08,   1.00000e+09,
         1.00000e+10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

# Take the best estimator found by the grid search and fit it on the whole
# undersampled, scaled training set.
model = gs.best_estimator_.fit(X1, y_undersample)
# Call form of print: valid on both Python 2 and Python 3 for a single argument.
print(model.intercept_)

[-6.72682073]

# One coefficient per predictor, indexed by predictor name. The frame is built
# directly from a dict instead of passing a set as `columns`: a set is
# unordered and newer pandas versions reject it.
coef_lasso = pd.DataFrame({"Coef": model.coef_[0]}, index=X1.columns)
coef_lasso

# The model was trained on min-max scaled predictors (X1), so the full data
# set has to go through the same fitted scaler before it is scored. Feeding
# the raw, unscaled values to the model distorts every prediction.
features = X.drop("income_level", axis = 1)
X = pd.DataFrame(scaler.transform(features), columns = features.columns, index = features.index)

y_hat = model.predict(X)

# Confusion table. The rows come from the observed income_level (the true
# class) and the columns from the model's predictions (the hypothesized
# class), so the axis labels are set accordingly.
pd.crosstab(df_train.income_level,
    y_hat,
    rownames = ['True Class'],
    colnames = ['Hypothesized Class'])

# Mean accuracy of the model's predictions for X against the observed income level.
model.score(X,df_train.income_level)

0.69158442886283789

# Second column of predict_proba: the predicted probability used for the ROC curve.
p_hat = model.predict_proba(X)[:, 1]

# False/true positive rates over all decision thresholds.
fpr, tpr, thresholds = metrics.roc_curve(df_train.income_level, p_hat)

# Area under the ROC curve, shown in the legend.
roc_auc = metrics.auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label = 'ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.title("Income Level ROC for undersampling model")
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.xlim([0, 1.01])
plt.ylim([0, 1.01])
plt.legend(loc = 'lower right')

<matplotlib.legend.Legend at 0x12c02eb8>

col_0	Frequency
income_level
0	0.937942
1	0.062058

col_0	Frequency
income_level
0	0.531889
1	0.468111

True Class	0	1
Hypothesized Class
0	129447	57694
1	3842	8540

	age	class_of_worker	industry_code	...	weeks_worked_in_year	year	income_level
0	73	Not in universe	0	...	0	95	0
1	58	Self-employed-not incorporated	4	...	52	94	0
2	18	Not in universe	0	...	0	95	0
...	...	...	...	...	...	...	...
199520	47	Not in universe	0	...	52	95	0
199521	16	Not in universe	0	...	0	95	0
199522	32	Private	42	...	52	94	0

	Coef
occupation_code	-1.778789
wage_per_hour	0.607547
capital_gains	13.805687
...	...
Joint both under 65	2.486173
Joint one under 65 & one 65+	2.008606
Single	2.299712