import itertools, collections # ad increments to a index of loop
import pandas as pan
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
from sklearn.tree import DecisionTreeRegressor # bining
from sklearn import preprocessing # Categorical attr to numeric
from sklearn.cluster import KMeans
from sklearn import decomposition
#plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
%matplotlib inline
def load_inst_data():
    """Load the 2012_to_2014_institutions_data.csv file.

    Adds two derived columns:
      - 'inst_id_yr': unique id built as '<Agency_Code>-<Respondent_ID>'
      - 'Respondent_Name': a unified institution name per id (the shortest
        known name, cleaned of punctuation and legal-suffix stop words)

    Returns
    -------
    inst_data : pandas.DataFrame
        The institutions data with the derived columns added.
    inst_name : dict
        Maps each 'inst_id_yr' id to its unified institution name.
    """
    inst_data = pan.read_csv('2012_to_2014_institutions_data.csv',
                             low_memory=False, na_values=['', ' ', ])
    # unique id for agency + respondent
    inst_data['inst_id_yr'] = inst_data.apply(
        lambda row: str(row['Agency_Code']) + '-' + str(row['Respondent_ID']), axis=1)
    # collect every name reported for each unique id
    inst_idn = inst_data[['inst_id_yr', 'Respondent_Name_TS']]
    inst_id_names = {}
    for i in inst_data.inst_id_yr.unique():
        names = inst_idn.loc[inst_idn.inst_id_yr == i]
        inst_id_names[i] = list(names.Respondent_Name_TS.unique())
    # tokens to drop from the institution name (legal suffixes etc.)
    stop_words = {'INC', 'CO', 'CIA', 'LLC', 'CORP', 'NA', 'FCU', 'CFCU'}
    inst_name = {}  # key: inst_id_yr, value: unified name
    for inst, known_names in inst_id_names.items():
        # the shortest known name is taken as the canonical one
        uname = min(known_names, key=len)
        # strip punctuation
        for ch in ('.', ',', "'"):
            uname = uname.replace(ch, '')
        # tokenize, upper-case and drop stop words.
        # (fixed: the original popped by pre-computed indices and only
        #  decremented the *next* index after each pop, so with more than
        #  two stop words the wrong tokens were removed)
        tokens = [t for t in (x.strip().upper() for x in uname.split(' '))
                  if t not in stop_words]
        inst_name[inst] = " ".join(tokens)
    inst_data['Respondent_Name'] = inst_data.apply(
        lambda row: inst_name[row['inst_id_yr']], axis=1)
    return inst_data, inst_name
# discretizing data function
# since there are outliers, the bins are built from the spread within 1.5 IQR of the quartiles
def Create_bins(data, variable='Loan_Amount_000', bin_name=None, quantiy_label=False,
                num_bins=None, regression=False, attributes=None, max_depth=None,
                min_samples_leaf=10000):
    """Discretize a numeric column of *data* into labelled bins.

    Two approaches:
    1) A fixed number of equal-width bins whose width is computed from the
       IQR outlier fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR], so outliers do not
       inflate the bin width.
    2) Regression binning: the split thresholds of a DecisionTreeRegressor
       fitted on *attributes* become the bin edges.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing *variable*; the bin column is added to it in place.
    variable : str
        Name of the column to discretize.
    bin_name : str, optional
        Name of the new bin column. Defaults to variable + '_bin'.
    quantiy_label : str, optional
        Suffix appended to each bin label ('%', 'k', '$', ...). Falsy values
        (including the default False) add no suffix.
    num_bins : int, optional
        Number of bins for approach 1 (default 40). Ignored when
        regression=True.
    regression : bool
        Use a decision tree to derive the bin edges. Default False.
    attributes : list of str
        Columns (including *variable*) fed to the regressor tree.
        Required when regression=True.
    max_depth : int, optional
        Tree depth; defaults to int(round(len(data) ** .5)).
    min_samples_leaf : int
        Minimum number of samples per tree leaf.

    Returns
    -------
    pandas.DataFrame
        *data* with the bin column added.
    """
    # ---- input validation -------------------------------------------------
    if not isinstance(variable, str):
        raise TypeError('variable must be a string')
    if regression:
        if attributes is None or not hasattr(attributes, '__len__'):
            raise TypeError('attributes must be provided as a list containing '
                            'the names of the columns to be used by the decision tree')
        if max_depth is None:
            max_depth = int(round(len(data) ** .5))
        else:
            max_depth = int(max_depth)
        min_samples_leaf = int(min_samples_leaf)
    elif num_bins is not None:
        num_bins = int(num_bins)
    # suffix appended to every label; falsy -> no suffix
    # (fixed: the original tested `!= None`, so the default False produced
    #  labels ending in the literal string 'False')
    suffix = str(quantiy_label) if quantiy_label else ''
    # ---- bin creation -----------------------------------------------------
    if regression:
        # fit a regression tree and use its split thresholds as bin edges.
        # (fixed: the original overwrote *variable* with 'Loan_Amount_000'
        #  and read the undefined global 'loan_data' instead of *data*)
        x = data[attributes].copy()
        x.dropna(inplace=True)
        y = x.pop(variable)
        DTC = DecisionTreeRegressor(max_depth=max_depth,
                                    min_samples_leaf=min_samples_leaf)
        DTC.fit(x, y)
        # thresholds of real split nodes only (leaf nodes are flagged -2)
        thrs_out = np.unique(DTC.tree_.threshold[DTC.tree_.feature > -2])
        bins = np.sort(thrs_out)
        # one label per interval between consecutive edges
        labels = ['{0}-{1}{2}'.format(round(bins[i - 1], 2), round(bins[i], 2), suffix)
                  for i in range(1, len(bins))]
    else:
        num_bins = num_bins if num_bins is not None else 40
        # IQR fences keep outliers from inflating the bin width
        q1 = data[variable].quantile(.25)
        q3 = data[variable].quantile(.75)
        iqr = q3 - q1
        low = max(0, q1 - iqr * 1.5)
        hig = q3 + iqr * 1.5
        cut = (hig - low) / num_bins
        # bin edges always start at 0 (as in the original implementation)
        bins = [0]
        for i in range(num_bins):
            bins.append(int(bins[i] + cut))
        labels = []
        for i in range(1, num_bins + 1):
            if i == num_bins:
                # open-ended label for the top bin
                labels.append('+' + str(bins[i - 1]) + suffix)
            else:
                labels.append(str(bins[i - 1]) + '-' + str(bins[i]) + suffix)
    # ---- assign each instance to its bin ---------------------------------
    if bin_name is None:
        bin_name = variable + '_bin'
    data[bin_name] = pan.cut(data[variable], bins, labels=labels)
    return data
def load_loan_data(variable_to_bin='Loan_Amount_000', bin_name=None, quantiy_label=False,
                   num_bins=None, regression=False, attributes=None, max_depth=None,
                   min_samples_leaf=10000):
    """Load 2012_to_2014_loans_data.csv, converting '' / ' ' values to NaN.

    Preprocessing performed:
      - numeric copies ('<col>_num') of object columns that hold numbers
      - label-encoded copies ('<col>_cat') of the categorical columns
      - 'inst_id_yr' id built as '<Agency_Code>-<Respondent_ID>'
      - a bin column for *variable_to_bin* created with Create_bins()
        (all remaining parameters are forwarded to Create_bins)

    Returns
    -------
    loan_data : pandas.DataFrame
    x_attr : list of str
        Numeric attribute names, including the encoded categoricals.
    cat_attr : list of str
        Original categorical attribute names.
    """
    loan_data = pan.read_csv('2012_to_2014_loans_data.csv',
                             na_values=['', ' ', ], low_memory=False)
    # object columns that actually hold numeric values
    col_to_numeric = ['Applicant_Income_000',
                      'Census_Tract_Number',
                      'County_Code',
                      'FFIEC_Median_Family_Income',
                      'MSA_MD',
                      'Number_of_Owner_Occupied_Units',
                      'Respondent_ID',
                      'Tract_to_MSA_MD_Income_Pct']
    for var in col_to_numeric:
        loan_data[var + '_num'] = pan.to_numeric(loan_data[var], errors='coerce')
    # numeric attributes
    num_attr = [x for x in loan_data.columns if '_num' in x]
    num_attr.extend(['Agency_Code', 'Loan_Amount_000', 'As_of_Year',
                     'Sequence_Number', 'State_Code', 'Conforming_Limit_000'])
    # categorical attributes
    cat_attr = ['Loan_Purpose_Description',
                'Lien_Status_Description',
                'Loan_Type_Description',
                'State', 'County_Name',
                'Conventional_Status',
                'Conforming_Status',
                'Conventional_Conforming_Flag']
    # label-encode every categorical attribute into '<name>_cat'
    cat_attr_num = []
    for attr in cat_attr:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(loan_data[attr].values))
        loan_data[attr + '_cat'] = lbl.transform(list(loan_data[attr].values))
        cat_attr_num.append(attr + '_cat')
    x_attr = num_attr.copy()
    x_attr.extend(cat_attr_num)
    # unique id for agency + respondent
    loan_data['inst_id_yr'] = loan_data.apply(
        lambda row: str(row['Agency_Code']) + '-' + str(row['Respondent_ID']), axis=1)
    # bin the requested variable (the original call re-derived every default
    # through redundant conditionals; arguments are now passed straight through)
    loan_data = Create_bins(loan_data,
                            variable=variable_to_bin,
                            bin_name=bin_name,
                            quantiy_label='k' if quantiy_label is False else quantiy_label,
                            num_bins=40 if num_bins is None else num_bins,
                            regression=regression,
                            attributes=x_attr if attributes is None else attributes,
                            max_depth=max_depth,
                            min_samples_leaf=10000 if min_samples_leaf is None else min_samples_leaf)
    return loan_data, x_attr, cat_attr
def hmda_init(cats=False, variable_to_bin='Loan_Amount_000', bin_name=None, quantiy_label=False,
              num_bins=None, regression=False, attributes=None, max_depth=None,
              min_samples_leaf=10000):
    '''Merge institutions and loan data using load_loan_data() and load_inst_data().

    Both source files must be located in the directory where the Python
    instance is running and must be named:
        Institutions file name : 2012_to_2014_institutions_data.csv
        Loans file name        : 2012_to_2014_loans_data.csv

    Returns
    -------
    data : pandas.DataFrame
        Loans and institutions data merged (always returned).
    x_attr, cat_attr : list of str
        Numerical and categorical attribute names; returned only when
        cats=True.

    Parameters
    ----------
    cats : bool
        When True the lists x_attr and cat_attr are also returned. Default False.
    variable_to_bin, bin_name, quantiy_label, num_bins, regression,
    attributes, max_depth, min_samples_leaf :
        Forwarded to load_loan_data() / Create_bins(); see Create_bins for
        their meaning.
    '''
    # institutions data
    inst_data, inst_name = load_inst_data()
    # these columns come from the loans side of the merge
    inst_data.drop('Agency_Code', axis=1, inplace=True)
    inst_data.drop('Respondent_ID', axis=1, inplace=True)
    # loan data -- forward the binning arguments
    # (fixed: the original called load_loan_data() with no arguments, so
    #  every parameter accepted by this function was silently ignored)
    loan_data, x_attr, cat_attr = load_loan_data(
        variable_to_bin=variable_to_bin, bin_name=bin_name,
        quantiy_label=quantiy_label, num_bins=num_bins, regression=regression,
        attributes=attributes, max_depth=max_depth,
        min_samples_leaf=min_samples_leaf)
    # merged result
    data = pan.DataFrame()
    # merge year by year so each loan gets that year's institution record
    for year in loan_data.As_of_Year.unique():
        d = loan_data.query('As_of_Year == @year').copy()
        i = inst_data.query('As_of_Year == @year').copy()
        temp = pan.merge(left=d, right=i, how='inner', on='inst_id_yr')
        data = pan.concat([data, temp])
    # both sides carried As_of_Year; keep a single column
    data.rename(columns={'As_of_Year_x': 'As_of_Year'}, inplace=True)
    data.drop('As_of_Year_y', axis=1, inplace=True)
    # numeric id for the unified respondent name
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(data['Respondent_Name'].values))
    data['Respondent_Name_id'] = lbl.transform(list(data['Respondent_Name'].values))
    x_attr.append('Respondent_Name_id')
    if cats:
        return data, x_attr, cat_attr
    return data
''' TODO: decide whether states, agency and respondent should be passed as strings or as ints '''
def hmda_to_json(data, states='all', conventional_conforming='all', agency='all', respondent='all'):
    """Export *data* (optionally filtered) to 'data.txt' as a list of JSON records.

    Parameters
    ----------
    data : pandas.DataFrame
        Resulting data object from the hmda_init function.
    states : str, int or list, optional
        State (or list of states) to keep. Default 'all' keeps every state.
    conventional_conforming : str or list, optional
        'all' (default) to keep every loan, or the
        Conventional_Conforming_Flag value(s) to keep.
    agency : str, int or list, optional
        Agency description(s) used to filter the export.
    respondent : str, int or list, optional
        Respondent id(s) used to filter the export.

    Example
    -------
    hmda_to_json(data, states='DC', conventional_conforming='all',
                 agency=['HUD'], respondent=['0000000384'])
    """
    # ---- input validation: normalise every filter to 'all' or a list -----
    if not isinstance(data, pan.DataFrame):
        raise TypeError('data must be a Pandas DataFrame')

    def _as_list(value, arg_name):
        # 'all' disables the filter; scalars are wrapped so .isin() works.
        # (fixed: the original rejected int scalars before the unreachable
        #  wrapping code, and passing a plain string made .isin() raise)
        if isinstance(value, str):
            return value if value == 'all' else [value]
        if isinstance(value, int):
            return [value]
        if isinstance(value, list):
            return value
        raise TypeError(arg_name + ' must be a scalar value or a list of values')

    states = _as_list(states, 'states')
    conventional_conforming = _as_list(conventional_conforming, 'conventional_conforming')
    agency = _as_list(agency, 'agency')
    respondent = _as_list(respondent, 'respondent')
    # ---- apply the filters ------------------------------------------------
    if states != 'all':
        data = data[data.State.isin(states)]
    if conventional_conforming != 'all':
        data = data[data.Conventional_Conforming_Flag.isin(conventional_conforming)]
    if agency != 'all':
        data = data[data.Agency_Code_Description.isin(agency)]
    if respondent != 'all':
        data = data[data.Respondent_ID.isin(respondent)]
    # ---- export -----------------------------------------------------------
    with open('data.txt', 'w') as file:
        file.write(data.to_json(orient='records'))
    print('Done!')
# build the merged dataset and keep the attribute name lists
data,x_attr,cat_attr = hmda_init(True)
# notebook echo: available columns
data.columns
# summary statistics of the numeric attributes
data[x_attr].describe().round(2)
# summary of the categorical attributes
data[cat_attr].describe(include='all')
# distribution of the loan amounts (in thousands)
data.Loan_Amount_000.describe().round(2)
# loans in the District of Columbia
dd = data.loc[(data.County_Name == "DISTRICT OF COL") & (data.State == 'DC' )]
dd.shape
# distribution of DC loan amounts
# NOTE(review): sns.distplot is deprecated in recent seaborn -- confirm the version in use
sns.distplot(dd.Loan_Amount_000)
# mean loan amount per State/year relative to the overall mean applicant income
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Borrower's indebtedness by State",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
# mean loan amount per State/year relative to the overall mean loan amount
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Loan_Amount_000.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Loan amount Deviation form average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
# log-scale histogram of loan amounts
data.Loan_Amount_000.plot.hist(logy=True,figsize=(10,6))
The logarithmic distribution of the loan amount suggests it follows a power law.
# loan amount vs the numeric state code
data.plot.scatter(x='State_Code', y='Loan_Amount_000',figsize=(10,6))
# loan amount by county code
data.plot.scatter(title='Loan amount by county',x='County_Code_num', y='Loan_Amount_000',figsize=(10,6))
There are some outliers in the data that need to be addressed. Since we have a vast amount of data we could replace the value with the mean of the similar values in terms of region, institution, and income of the borrower. On the other hand, I need to see if the data points are within Change Financial's region of interest.
# loan amount split by the encoded Conventional/Conforming flag
data.plot.scatter(x='Conventional_Conforming_Flag_cat', y='Loan_Amount_000',figsize=(10,6))
No is zero, Yes is 1
# loan amount vs CC flag cross-tabulation
outcome = pan.crosstab(data['Loan_Amount_000'], data['Conventional_Conforming_Flag'])
plt.show(outcome.plot(kind="bar", title = 'Loan Amount by C&C',figsize=(20,10)))
# income vs loan amount, colored by the encoded CC flag (0=blue, 1=green)
fig, ax = plt.subplots(figsize=(10, 6), ncols=1)
color_pallet= {0:'blue', 1:'green'}
ax.scatter(data['Applicant_Income_000_num'],
           data['Loan_Amount_000'],
           c=data['Conventional_Conforming_Flag_cat'].apply(lambda x: color_pallet[x]))
plt.show()
# number of instances per loan-amount bin
a = data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count()
#print('Numer of instances per bin: \n',a,'\n\n', '% of instances per bin: \n',
#   a/loan_data.Loan_Amount_000.count(),end='\n')
print('% of instances per bin: \n',a/data.Loan_Amount_000.count())
# distribution of loans across the bins
data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count().plot.bar(title='Loan amount distribution', figsize=(10,6))
# loan amount summary per State/County
data.groupby(['State','County_Name'])['Loan_Amount_000'].describe()
For upcoming years we can create validations to identify outliers or instances that might be errors. Those validations will be based on previous data.
Validations to apply:
- Trends: are the values in the same trend as years before? The loan amount is moving accordingly.
- Outliers: house prices move within ranges, verify that new values are within expectation. Otherwise, data may be erroneous or an outlier.
- Comparing loan amount to income we get a ratio of indebtedness for the borrower. Incorporating the rate spread we may also estimate the installment and get the debt-to-income ratio (DTI).
All validations may be filtered by institution too.
# fixed: load_inst_data() returns two values; the original three-name
# unpacking (inst_data, inst_names, inst_name) raised a ValueError
inst_data,inst_name = load_inst_data()
# number of unique unified institution names
len(inst_data.Respondent_Name.unique())
# number of unique respondent ids
len(inst_data.Respondent_ID.unique())
# find the longest unified institution name
maxl = 0
ina = ''
for i in inst_name:
    if len(inst_name[i]) > maxl:
        maxl = len(inst_name[i])
        ina = i
print(maxl,inst_name[ina])
# a respondent id shared across agencies, to verify name uniqueness per key
inst_data.loc[inst_data.Respondent_ID == '0000009788']
The names are unique to the key Agency_Code-Respondent_ID
This is an important risk attribute that shows payment capacity of a given borrower.
Validations:
- Data distribution. Is the income within the state and county ranges and expected trends at each level?
- Limit of detection. Set boundaries to determine possible errors or outliers in the data.
- Missing values. Is the number of missing values as in previous years, is it increasing/decreasing?
# notebook echo of the cnt value from the previous cell
cnt
# mean applicant income per State/year relative to the overall mean income
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Income Deviation from Region Average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
This attribute provides information about potential risk. A shift in the composition of the applications and loans could represent an underlying recovery risk: when a loan defaults, the proceeds are distributed according to the lien order.
# share of the total loan amount by agency and lien status
cnt = data.groupby(['Agency_Code_Description','Lien_Status_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the total loan amount by agency
cnt = data.groupby(['Agency_Code_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the number of loans by State
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Loan distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the loan amount by State
cnt = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Amount distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# market share by State within each year (one pie per year)
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum()
cnt = cnt /data.groupby(['As_of_Year'])['Loan_Amount_000'].sum()
cnt.unstack().plot.pie(subplots=True,figsize=(20,5),
                       title='Market Share by Year & State',fontsize=17,autopct ='%1.1f%%')
# difference (in points) between amount share and count share per State
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt2 = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cn = (cnt2 - cnt)*100
print(cn)
# average applicant income by State and year
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# average loan amount by State and year
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Loan Amount by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# duplicate of the average-income plot above (kept as in the original)
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# average income and loan amount side by side per State
cnt = data.groupby(['State'])['Loan_Amount_000',
                              'Applicant_Income_000_num'].mean()
ax = cnt.plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income and Loan amount by State ',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# total amount and number of loans per year
cnt =data.groupby(['As_of_Year'])['Loan_Amount_000']
print(cnt.sum(),cnt.count())
# total loan amount by State and year
data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount by State and Year',
    kind='bar',
    figsize=(10,5))
# total loan amount for each CC flag by year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount for Conventional & Conforming Loans by Year',
    kind='bar',
    figsize=(10,5))
# FFIEC median family income by State and year
data.groupby(['State','As_of_Year'])['FFIEC_Median_Family_Income_num'].mean().unstack().plot(
    title='FFIEC Average Income by State and Year',
    kind='bar',
    figsize=(10,5))
# share of the number of loans by State and CC flag
# (fixed: the original divided by the undefined module-level name
#  'loan_data' -- the merged frame is 'data')
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].count()
cnt = cnt / data.shape[0]
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by loan',fontsize=15)
ax.set_ylabel("% of number of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=12)
# share of the loan amount by State and CC flag
# (same fix: 'loan_data' -> 'data')
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].sum()
cnt = cnt / data.Loan_Amount_000.sum()
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by Amount',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=14)
ax.set_xlabel("State by CC Flag",fontsize=14)
# correlation plot using heatmap (lower triangle only)
cor = data.corr()
# fixed: the np.bool alias was removed in NumPy 1.24 -- use the builtin bool
mask = np.zeros_like(cor, dtype=bool) # hiding upper triangle
mask[np.triu_indices_from(mask,k=1)] = True
f, ax = plt.subplots(figsize=(8, 8))
cmap = sns.diverging_palette(15, 150,center='light', as_cmap=True)
sns.heatmap(cor, mask=mask, cmap=cmap, vmax=.3,square=True, linewidths=1.15, cbar_kws={"shrink": .65}, ax=ax,)
# fixed: 'loan_data' is not defined at module level -- use the merged 'data'
scatter_matrix(data[x_attr], alpha=0.2, figsize=(20, 10), diagonal='kde')
# mean loan amount, applicant income and FFIEC income by State and CC flag
data.groupby(['State',
              'Conventional_Conforming_Flag'])['Loan_Amount_000',
                                               'Applicant_Income_000_num',
                                               'FFIEC_Median_Family_Income_num'].mean().round(2).style.format("{:2}")
# mean loan amount and income by CC flag and year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000',
             'Applicant_Income_000_num'].mean().unstack().plot(title='Income and Loan Amount Conventional and Conforming',kind='bar',figsize=(10,5))
The average loan amount is increasing while the income of the target market has been declining over the past three years. This is potentially increasing stress on households' payment capacity. In addition, the number of loans per segment and year is plotted below.
# number of loans by CC flag and year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].count().unstack().plot(kind='bar',
                                                                                                      figsize=(10,5))
def plot_PCA_by4(DTtrans,y_pred,c1=1,c2=2,c3=3,c4=4):
    ''' Plot principal components in a 2x2 figure: the first row compares
    component c1 against c2 and c3; the second row compares c2 against c3
    and c4.

    Parameters
    ----------
    DTtrans: Numpy array with numeric values (PCA-transformed data, one
        component per column).
    y_pred: Cluster assigned to each instance in DTtrans; used as the
        scatter color.
    c1: Column index plotted on the x-axis of row 1. Default to 1.
    c2: Column index compared against c1, and plotted on the x-axis of
        row 2. Default to 2.
    c3: Column index compared against c1 (row 1, column 2) and c2
        (row 2, column 1). Default to 3.
    c4: Column index compared against c2 (row 2, column 2). Default to 4.

    NOTE(review): the defaults 1..4 are 0-based column indices, i.e. they
    select the 2nd-5th principal components while the axis labels read
    PC1-PC4 (column 0 -- the first PC -- is never shown); the first subplot
    also labels the y-axis 'PC1' although its y data is column c2 -- confirm
    whether this is intended before changing it.
    '''
    plt.figure(figsize=(12, 12))
    # row 1: c1 vs c2, then c1 vs c3
    plt.subplot(221)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c2], c= y_pred)
    plt.ylabel('PC1')
    plt.xlabel('PC2')
    plt.title('KMeans Clustering')
    plt.subplot(222)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c3], c= y_pred)
    plt.xlabel('PC3')
    plt.title('KMeans Clustering')
    # row 2: c2 vs c3, then c2 vs c4
    plt.subplot(223)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c3], c= y_pred)
    plt.ylabel('PC2')
    plt.xlabel('PC3')
    plt.subplot(224)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c4], c= y_pred)
    plt.xlabel('PC4')
    plt.show()
def Kmean_cluster(data,n_clusters=8, plot=True,n_components=None,normalize='zscore'):
    '''
    Performs unsupervised clustering using KMeans after PCA dimensionality
    reduction.

    Returns the fitted KMeans model, the PCA-reduced data, the fitted PCA
    model and the cluster label assigned to each instance.

    Parameters:
    -----------
    data: pandas DataFrame with the merged data. The module-level list
        x_attr selects the columns used -- TODO: consider passing it as a
        parameter.
    n_clusters: number of clusters to create. Default 8 (sklearn's default).
    plot: bool, create the cluster plots. Default True.
    n_components: number of components created by PCA. Default: the number
        of attributes passed in data.
    normalize: normalization to apply, 'zscore' (default) or 'minmax'.
    '''
    # normalization functions: minmax rescales to [0, 100]
    minmax = lambda x: ((x-x.min())/(x.max()-x.min()) * (100-0) + 0)
    zscore = lambda x: (x - x.mean()) / x.std()
    if normalize == 'zscore':
        data_norm = data[x_attr].apply(zscore)
    else:
        data_norm = data[x_attr].apply(minmax)
    # remove null data
    data_norm.dropna(inplace=True)
    colnames = list(data_norm.columns)
    # PCA dimensionality reduction
    if n_components is None:
        n_components = len(colnames)
    pca = decomposition.PCA(n_components=n_components)
    DTtrans = pca.fit(data_norm).transform(data_norm)
    # fit a single KMeans model
    # (fixed: the original fitted a second, default KMeans() that ignored
    #  n_clusters and random_state, and that model was the one returned)
    kmeans = KMeans(n_clusters=n_clusters, random_state=33)
    clusters = kmeans.fit_predict(DTtrans)
    # variance captured by component
    print('Variance ratio provided by component: \n', pca.explained_variance_ratio_, '\n')
    print('Percentage Variance Captured with:')
    for i in range(1, len(colnames)+1):
        print('\t %i components %0.1f' % (i, 100*sum(pca.explained_variance_ratio_[:i])))
    # scree plot of the explained variance
    plt.figure(1, figsize=(7, 5))
    plt.clf()
    plt.plot(pca.explained_variance_, linewidth=3)
    plt.xlabel('Number of components')
    plt.ylabel('Explained variance')
    print('\nTotal number of clusters created: %i' % (len(kmeans.cluster_centers_)))
    if plot:
        # fixed: the original passed the undefined name 'y_pred' (NameError)
        plot_PCA_by4(DTtrans, clusters)
    return kmeans, DTtrans, pca, clusters
# cluster with minmax normalization, 20 clusters
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='minmax')
# minmax normalization
# centroid coordinates, one column per attribute
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)
# attributes with an absolute loading of at least 0.35 on each principal component
for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
centroids.describe()
# per-column mean and standard deviation of every numeric column
datos_mm = []
for col in data.columns:
    if data[col].dtype in [int, float]:
        datos_mm.append([col, data[col].mean(), data[col].std()])
# fixed: the stored values are mean/std but the columns were labelled
# 'min'/'max'; the equivalent zscore block later in this file labels them
# 'mean'/'std' correctly
datos_mm = pan.DataFrame(datos_mm, columns=['attr', 'mean', 'std'])
datos_mm.to_csv('Centroids minmax vals.csv', index=False)
# cluster with zscore normalization, 20 clusters
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='zscore')
# zscore normalization
# centroid coordinates, one column per attribute
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)
# attributes with an absolute loading of at least 0.35 on each principal component
for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
# per-column mean and standard deviation of every numeric column
datos_mm =[]
for col in data.columns:
    if data[col].dtype in [int,float]:
        temp =[]
        temp.append(col)
        temp.append(data[col].mean())
        temp.append(data[col].std())
        datos_mm.append(temp)
datos_mm = pan.DataFrame(datos_mm,columns=['attr','mean','std'])
# inspect the records of a single institution
data.loc[data.Respondent_Name =='WELLS FARGO BANK']
# list the states covered by each MSA/MD code
a = data.MSA_MD_num.unique()
a.sort()
# notebook echo of the sorted codes
a
for i in a:
    if np.isnan(i) == False:
        print(data.loc[data.MSA_MD_num == i].State.unique())