Julio Sotelo | Capital One Data Challenge

In [1]:
import itertools, collections # ad increments to a index of loop
import pandas as pan
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
from sklearn.tree import DecisionTreeRegressor # bining
from sklearn import preprocessing # Categorical attr to numeric
from sklearn.cluster import KMeans
from sklearn import decomposition
#plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
%matplotlib inline

Loading institutions data

In [15]:
def load_inst_data():
    """ Loads 2012_to_2014_institutions_data file. Adding a new field with the unified
    name of the respondant institution. In addition it creates a dictionary
    with the unique id and the name given to each institution
    
    """

    ' institutions data'
    inst_data = pan.read_csv('2012_to_2014_institutions_data.csv',low_memory=False,na_values=['',' ',])

    # setting Id for institution and year
    inst_data['inst_id_yr'] = inst_data.apply(lambda row: str(row['Agency_Code'])+'-'+str(row['Respondent_ID']),axis=1)    
    
    # unique responders    
    inst_ids =  list(inst_data.inst_id_yr.unique())
    # names given
    inst_idn = inst_data[['inst_id_yr','Respondent_Name_TS']]
    
    inst_id_names = {}
    for i in inst_ids:
        names = inst_idn.loc[inst_idn.inst_id_yr == i]    
        inst_id_names[i] = list(names.Respondent_Name_TS.unique())
    
    # stop words to delete from the institution name
    stop_words = ['INC','CO','CIA','LLC','CORP','NA','FCU','CFCU']
    
    #dictionary of institutions names and list of all known names
    inst_name = {} # key is respondent id, value name given
    for inst in inst_id_names:
        # inst='7-34-2000096'
        # select the shortest name
        uname = min(inst_id_names[inst],key=len)
        # removing punctuation 
        name = uname.replace('.', '')
        name = name.replace(',', '')
        name = name.replace("'", '')
        # name to list
        temp = [x.strip().upper() for x in name.split(' ')] 
        # locating stop words index
        dtemp =[]
        for i in range(len(temp)):   
            try:
                if temp[i] in stop_words:
                    dtemp.append(i)
            except:
                continue
        # removing stop words
        for i in range(len(dtemp)): 
            temp.pop(dtemp[i])
            if i < len(dtemp)-1:
                dtemp[i+1] -=1
            
        # record names
        name = " ".join(x for x in temp)
        if inst not in inst_name:
            inst_name[inst] = name
       
    
    inst_data['Respondent_Name'] = inst_data.apply(lambda row: inst_name[row['inst_id_yr']],axis=1)
    
    
    return inst_data,inst_name

Binning function

In [16]:
# discretizing data function
# since i have outliers i will create the bins considering the spread from 2 standard deviations from de median
def Create_bins(data,variable='Loan_Amount_000',bin_name = None,quantiy_label=False,num_bins=None,
                regression=False,attributes=None,max_depth=None,min_samples_leaf=10000):
    """ Creates bins automatically using two aproaches:
    
    
        1) Bining to a given number of bins. Calculated using statistics of data within the IQR outliers 
            identification method [Q2-1.IQR,Q3+1.5IQR]. Doing so we eliminate the noise of possible outliers.
        2) Binig by regression using decision trees.
        
        Parameters
        ----------
        data: Panda Dataframe tha contains the variable and where the bin class of each instance will be set
        
        variable: column name of the data what would be discretize.
        
        bin_name: Column name where the bin will be stored in the dataframe. Default: variable_bin 
            i.ei I f variable = 'loan_amnt' then bin_name = loan_amnt_bin
        
        quantiy_label: Optional label to be added to each bin label ( %, k, $, etc).
        
        num_bins: number of bins to be created, ignore when regression is True.
        
        regression: Bolean, use decision trees to calculate bins. True / False (default).
        
        attributes: List with columns names of the data provided to by use in the regressor tree.
        
        max_depth : int or None, optional (default=int(round(len(data)**.5)))
           The maximum depth of the tree. If None, then nodes are expanded until
           all leaves are pure or until all leaves contain less than
           min_samples_split samples.:
           
        min_samples_split : int, float, optional (default=2)
          The minimum number of samples required to split an internal node
        
    """
    #data validation
    if type(variable) != str:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('variable must be a string') from e        
    if regression == True:
        # provigind a valid list of attributes
        try: 
            len(attributes)
        except TypeError as e:
            raise Exception('''Attributes must be provided as a list containing the names of the columns to be use 
                            by the decision tree''') from e
        # valid number 
        if max_depth == None:
            max_depth=int(round(len(data)**.5))
        else:
            try:
                int(max_depth)
            except ValueError as e:
                raise Exception('max_depth must be an integer') from e
        # valid number 
        try:
            int(min_samples_leaf)
        except ValueError as e:
                raise Exception('min_samples_leaf must be an integer') from e
    else: 
        # bin by fixed number
        # valid number 
        if num_bins != None:
            try:
                int(num_bins)
            except ValueError as e:
                    raise Exception('num_bins must be an integer') from e

    # bin creation
    if regression == True:
        variable= 'Loan_Amount_000'
        #x = data[data.columns.difference([variable])].copy()
        x = loan_data[attributes].copy()
        x.dropna(inplace=True)
        y = x.pop(variable)          
        DTC = DecisionTreeRegressor(max_depth=max_depth,min_samples_leaf=min_samples_leaf)
        DTC.fit(x,y) 
        thrs_out = np.unique(DTC.tree_.threshold[DTC.tree_.feature > -2] )
        bins = np.sort(thrs_out)
        ln = len(thrs_out)
        labels = []
        for i in range(1,ln):
            if i == ln:
                labels.append('+'+str(round(thrs_out[i-1],2))+str(quantiy_label if quantiy_label != None else ""))
            else:
                labels.append(str(round(thrs_out[i-1],2))+'-'+str(round(thrs_out[i],2))+
                              str(quantiy_label if quantiy_label != None else ""))
    else:
        # number of bins tobe created
        num_bins = num_bins if num_bins != None else 40
        IQR = data[variable].describe()[-2]- data[variable].describe()[-4]
        low = max(0,(data[variable].describe()[-4] - IQR*1.5))
        hig = (data[variable].describe()[-2] + IQR*1.5)
        cut = (hig - low )/ num_bins
        # starting point of bins
        bins = [0] 
        for i in range(num_bins):
            bins.append(int(bins[i]+cut))
        labels = []
        for i in range(1,num_bins+1):
            if i == num_bins:
                labels.append('+'+str(bins[i-1])+str(quantiy_label if quantiy_label != None else ""))
            else:
                labels.append(str(bins[i-1])+'-'+str(bins[i])+str(quantiy_label if quantiy_label != None else ""))
    # setting bin in each instance
    if bin_name == None:
        bin_name = variable+'_bin'
        
    data[bin_name] = pan.cut(data[variable],bins,labels=labels)
    
    return data

Loading Loan Data

In [26]:
def load_loan_data(variable_to_bin='Loan_Amount_000',bin_name = None,quantiy_label=False,num_bins=None,
                regression=False,attributes=None,max_depth=None,min_samples_leaf=10000):
    ''' Loads loan data converting to null any value like  ""  or " ". 

    Returns a panda dataframe 

     '''

    loan_data = pan.read_csv('2012_to_2014_loans_data.csv',na_values=['',' ',],low_memory=False)

    '---------------- ---------------- ---------------- ----------------'
    ' data preprocessing ' 
    loan_data.dtypes

    # converting object to numeric values
    col_to_numeric =['Applicant_Income_000',
                    'Census_Tract_Number',
                    'County_Code',
                    'FFIEC_Median_Family_Income',
                    'MSA_MD',
                    'Number_of_Owner_Occupied_Units',
                    'Respondent_ID',
                    'Tract_to_MSA_MD_Income_Pct']

    for var in col_to_numeric:
        loan_data[var+'_num'] = pan.to_numeric(loan_data[var],errors='coerce')

    # numeric attributes
    num_attr = [x for x in loan_data.columns if '_num' in x]
    num_attr.extend(['Agency_Code','Loan_Amount_000','As_of_Year','Sequence_Number',
                     'State_Code','Conforming_Limit_000'])


    #categorical attributes
    cat_attr = ['Loan_Purpose_Description',
                'Lien_Status_Description',
                'Loan_Type_Description',
                'State','County_Name',
                'Conventional_Status',
                'Conforming_Status',
                'Conventional_Conforming_Flag']

    # list with cat attributes with descriptions of a numerical attribute:
    desc_attr = ['MSA_MD_Description','Agency_Code_Description']

    # creating numerical values for all categorical attributes
    cat_attr_num = []
    for attr in cat_attr:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(loan_data[attr].values))
        loan_data[attr+'_cat'] = lbl.transform(list(loan_data[attr].values))
        cat_attr_num.append(attr+'_cat')



    x_attr = num_attr.copy()
    x_attr.extend(cat_attr_num)

    # id for loan institution
    loan_data['inst_id_yr'] = loan_data.apply(lambda row: str(row['Agency_Code'])+'-'+str(row['Respondent_ID']),axis=1)
    
    loan_data = Create_bins(loan_data,
                   variable='Loan_Amount_000' if variable_to_bin == 'Loan_Amount_000' else variable_to_bin,
                   bin_name = None if bin_name == None else bin_name,
                   quantiy_label='k' if quantiy_label == False else quantiy_label,
                   num_bins=40 if num_bins == None else num_bins ,
                   regression=False if regression == False else True,
                   attributes=x_attr if attributes == None else attributes,
                   max_depth=None if max_depth == None else max_depth,
                   min_samples_leaf=10000 if min_samples_leaf==None else min_samples_leaf)

    return loan_data, x_attr, cat_attr

Merging data & returning Panda Dataframe

In [27]:
def hmda_init(cats=False,variable_to_bin='Loan_Amount_000',bin_name = None,quantiy_label=False,num_bins=None,
                regression=False,attributes=None,max_depth=None,min_samples_leaf=10000):
    ''' Merging institutions and loan data using load_loan_data() and load_inst_data() functions.
    Both source files mus be located in the same directory where the Python instance is running. In addition, 
    both files must be named as shown bellow.
    
        Institutions file name : 2012_to_2014_institutions_data.csv
        Loans file names       : 2012_to_2014_loans_data.csv

    This functios returns the following objects: 
        - By default returns data, a panda dataframe with the loans and institutions data merged.
        - Optional to get two additional list x_attr and cat_attr, containing the list of names of numerical 
            attributes and categorical attributes respectively.
            
    Parameters:
    ===========
    cats: Bolean, when True the list x_attr and cat_attr are returned. Drfault to False.
    
    data: Panda Dataframe tha contains the variable and where the bin class of each instance will be set
        
    variable: String with column name of the data what would be discretize.

    bin_name: String with name where the bin will be stored in the dataframe. Default: variable+'_bin'
        i.ei I f variable = 'loan_amnt' then bin_name = loan_amnt_bin

    quantiy_label: Optional string label to be added to each bin label ( %, k, $, etc). Default to "k"

    num_bins: number of bins to be created, ignore when regression is True.

    regression: Bolean, use decision trees to calculate bins. True / False(default).

    attributes: List with columns names as strings of the data provided to by use in the regressor tree.

    max_depth : int or None, optional (default=int(round(len(data)**.5)))
       The maximum depth of the tree. If None, then nodes are expanded until
       all leaves are pure or until all leaves contain less than
       min_samples_split samples.:

    min_samples_split : int, float, optional (default=2)
      The minimum number of samples required to split an internal node
        
    '''
    # institutions data
    inst_data,inst_name = load_inst_data()
    # no need of the colums in merge
    inst_data.drop('Agency_Code',axis=1,inplace=True)
    inst_data.drop('Respondent_ID',axis=1,inplace=True)

    # loan data
    loan_data, x_attr, cat_attr = load_loan_data()

    # data has the datat gerge
    data = pan.DataFrame()

    # for each year set the institution data
    for year in loan_data.As_of_Year.unique():
        d = loan_data.query('As_of_Year == @year').copy()
        i = inst_data.query('As_of_Year == @year').copy()
        temp= pan.merge(left=d,right=i,how='inner',on='inst_id_yr')
        data = pan.concat([data,temp])
    
    # columns name handling
    data.rename(columns={'As_of_Year_x':'As_of_Year'},inplace=True)
    data.drop('As_of_Year_y',axis=1,inplace=True)
    
    #respondent name id
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(data['Respondent_Name'].values))
    data['Respondent_Name_id'] = lbl.transform(list(data['Respondent_Name'].values))
    x_attr.append('Respondent_Name_id')
    
    if cats:
        return data,x_attr,cat_attr
    else:
        return data

Data Json Export

In [28]:
''' Pendiente definir si states, agency y resondent va como string o como int '''


def hmda_to_json(data, states='all', conventional_conforming='all',agency='all',respondent='all'):
    """
    Create a text file witha list of json object containing the records aplying specific filters.
    
    Parameters
    ----------
    
    data: Resulting data object from hmda_init function
    
    states: Opitional integer with unique state or list of integers with states to get. Default selects all States available
    
    conventional_conforming:  Optional integer or list indicating to export both or either one of Conventional 
            and Conforming loans.
            Examples:
                conventional_conforming ='all' fo get all loans. Default.
                conventional_conforming ='Conventional' to get only conventional loan.
                conventional_conforming = 'Conforming' to get only conforming loans.
                conventional_conforming = 'Jumbo' to get only Jumbo loans.
    
    agency: Optional list with agancy codes to filter the export.
    
    respondent: Optional list with respondent ids to filter the export
    
    Example:
        
        hmda_to_json(data, states=11, conventional_conforming='all',agency=5,respondent=[5588,3601])
    
    """

    # inputs validation
    if type(data) != pan.core.frame.DataFrame:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('data must be a Pandas DataFrame') from e        
    
    if type(states) not in [list,str]:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('states  must be a string with one state or a list of states') from e        
    if type(states) ==  int: 
            states=[states]

    if type(conventional_conforming) not in [list,str]:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('staconventional_conformingtes must be a string ("Y" or "N") a list like ["Y","N"]') from e

    if type(agency) not in [list,str]:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('agency must be a string with one agency code or a list of agency names') from e
    if type(agency) ==  int: 
            agency=[agency]
        
    if type(respondent) not in [list,str]:
        try: 
            int('e') 
        except ValueError as e: 
            raise Exception('respondent must be a string with one respondent id (like "0000000384") or a list of respondent ids') from e
    if type(respondent) ==  int: 
            respondent=[respondent]
        
        
    # applying filter
    #states = [11]
    #data_f.groupby('Respondent_ID_num')['State_Code'].count()
    
    if states != 'all':
        data = data[data.State.isin(states)]
    if conventional_conforming != 'all':
        data = data[data.Conventional_Conforming_Flag.isin(conventional_conforming)]
    #agency = [5] #data.Agency_Code.unique()
    if agency != 'all':
        data = data[data.Agency_Code_Description.isin(agency)]
    # respondent = [5588,3601]
    if respondent != 'all':
        data = data[data.Respondent_ID.isin(respondent)]
    
    data_j = data.to_json(orient='records')
    
    with open('data.txt','w') as file:
        file.write(data_j)


    print('Done!')
In [29]:
data,x_attr,cat_attr = hmda_init(True)
In [30]:
data.columns
Out[30]:
Index(['Agency_Code', 'Applicant_Income_000', 'As_of_Year',
       'Census_Tract_Number', 'County_Code', 'FFIEC_Median_Family_Income',
       'Loan_Amount_000', 'MSA_MD', 'Number_of_Owner_Occupied_Units',
       'Respondent_ID', 'Sequence_Number', 'State_Code',
       'Tract_to_MSA_MD_Income_Pct', 'MSA_MD_Description',
       'Loan_Purpose_Description', 'Agency_Code_Description',
       'Lien_Status_Description', 'Loan_Type_Description', 'State',
       'County_Name', 'Conforming_Limit_000', 'Conventional_Status',
       'Conforming_Status', 'Conventional_Conforming_Flag',
       'Applicant_Income_000_num', 'Census_Tract_Number_num',
       'County_Code_num', 'FFIEC_Median_Family_Income_num', 'MSA_MD_num',
       'Number_of_Owner_Occupied_Units_num', 'Respondent_ID_num',
       'Tract_to_MSA_MD_Income_Pct_num', 'Loan_Purpose_Description_cat',
       'Lien_Status_Description_cat', 'Loan_Type_Description_cat', 'State_cat',
       'County_Name_cat', 'Conventional_Status_cat', 'Conforming_Status_cat',
       'Conventional_Conforming_Flag_cat', 'inst_id_yr', 'Loan_Amount_000_bin',
       'Respondent_Name_TS', 'Respondent_City_TS', 'Respondent_State_TS',
       'Respondent_ZIP_Code', 'Parent_Name_TS', 'Parent_City_TS',
       'Parent_State_TS', 'Parent_ZIP_Code', 'Assets_000_Panel',
       'Respondent_Name', 'Respondent_Name_id'],
      dtype='object')

Loan Data - Data Review

In [31]:
data[x_attr].describe().round(2)
/Users/Julio/anaconda/lib/python3.5/site-packages/numpy/lib/function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[31]:
Applicant_Income_000_num Census_Tract_Number_num County_Code_num FFIEC_Median_Family_Income_num MSA_MD_num Number_of_Owner_Occupied_Units_num Respondent_ID_num Tract_to_MSA_MD_Income_Pct_num Agency_Code Loan_Amount_000 ... Conforming_Limit_000 Loan_Purpose_Description_cat Lien_Status_Description_cat Loan_Type_Description_cat State_cat County_Name_cat Conventional_Status_cat Conforming_Status_cat Conventional_Conforming_Flag_cat Respondent_Name_id
count 1203305.00 1319481.00 1320321.00 1319481.00 1225941.00 1319284.00 9.106370e+05 1318445.00 1321158.00 1321158.00 ... 1320321.00 1321158.00 1321158.00 1321158.00 1321158.00 1321158.00 1321158.00 1321158.00 1321158.00 1321158.00
mean 118.02 3897.81 144.59 87671.39 35884.43 1383.55 1.379205e+09 117.17 6.85 290.23 ... 532.91 0.64 0.01 0.59 2.51 92.77 0.30 0.05 0.67 753.78
std 122.70 3357.40 229.67 19541.82 15166.98 565.35 2.791193e+09 37.76 2.49 965.78 ... 86.91 0.48 0.12 1.05 0.83 59.17 0.46 0.21 0.47 404.34
min 0.00 1.00 1.00 45300.00 12580.00 4.00 4.700000e+01 4.96 1.00 1.00 ... 417.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
25% NaN NaN NaN NaN NaN NaN NaN NaN 7.00 153.00 ... NaN 0.00 0.00 0.00 2.00 46.00 0.00 0.00 0.00 407.00
50% NaN NaN NaN NaN NaN NaN NaN NaN 7.00 235.00 ... NaN 1.00 0.00 0.00 3.00 91.00 0.00 0.00 1.00 819.00
75% NaN NaN NaN NaN NaN NaN NaN NaN 9.00 347.00 ... NaN 1.00 0.00 1.00 3.00 140.00 1.00 0.00 1.00 1110.00
max 9999.00 9902.00 840.00 113400.00 49020.00 3632.00 9.731401e+09 350.67 9.00 99625.00 ... 625.00 1.00 1.00 3.00 4.00 210.00 1.00 1.00 1.00 1335.00

8 rows × 23 columns

In [32]:
data[cat_attr].describe(include='all')
Out[32]:
Loan_Purpose_Description Lien_Status_Description Loan_Type_Description State County_Name Conventional_Status Conforming_Status Conventional_Conforming_Flag
count 1321158 1321158 1321158 1321158 1320321 1321158 1321158 1321158
unique 2 2 4 5 210 2 2 2
top Refinance First Lien Conventional VA FAIRFAX Conventional Conforming Y
freq 843030 1301377 929170 679703 125375 929170 1258551 882206

Quality Check

a - Loan Amount

In [33]:
data.Loan_Amount_000.describe().round(2)
Out[33]:
count    1321158.00
mean         290.23
std          965.78
min            1.00
25%          153.00
50%          235.00
75%          347.00
max        99625.00
Name: Loan_Amount_000, dtype: float64
In [168]:
dd = data.loc[(data.County_Name == "DISTRICT OF COL") & (data.State == 'DC' )]
dd.shape
sns.distplot(dd.Loan_Amount_000)
/Users/Julio/anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[168]:
<matplotlib.axes._subplots.AxesSubplot at 0x2e3b4d160>
In [180]:
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Borrower's indebtedness by State",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
Out[180]:
<matplotlib.legend.Legend at 0x17e00e5f8>
In [ ]:
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Loan_Amount_000.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Loan amount Deviation form average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
In [31]:
data.Loan_Amount_000.plot.hist(logy=True,figsize=(10,6))
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x116864358>

The logarithmic distribution of the loan amont is power law

In [13]:
data.plot.scatter(x='State_Code', y='Loan_Amount_000',figsize=(10,6))
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11474e2b0>
In [162]:
data.plot.scatter(title='Loan amount by county',x='County_Code_num', y='Loan_Amount_000',figsize=(10,6))
Out[162]:
<matplotlib.axes._subplots.AxesSubplot at 0x2c5e7f128>

There are some outliers in the data that need to be addressed. Since we have a vast amount of data we could replace the value with the mean of the similar values in terms of region, institution, and income of the borrower. On the other hand, I need to see if the data points are within Change Financial's region of interest.

In [76]:
data.plot.scatter(x='Conventional_Conforming_Flag_cat', y='Loan_Amount_000',figsize=(10,6))
Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x141788400>

No is zero, Yes is 1

In [100]:
outcome = pan.crosstab(data['Loan_Amount_000'], data['Conventional_Conforming_Flag'])
plt.show(outcome.plot(kind="bar", title = 'Loan Amount by C&C',figsize=(20,10)))
In [90]:
fig, ax = plt.subplots(figsize=(10, 6), ncols=1)
color_pallet= {0:'blue', 1:'green'}
ax.scatter(data['Applicant_Income_000_num'], 
           data['Loan_Amount_000'], 
            c=data['Conventional_Conforming_Flag_cat'].apply(lambda x: color_pallet[x]))
plt.show()
In [152]:
a = data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count()
#print('Numer of instances per bin: \n',a,'\n\n', '% of instances per bin: \n',
#    a/loan_data.Loan_Amount_000.count(),end='\n')
print('% of instances per bin: \n',a/data.Loan_Amount_000.count())
data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count().plot.bar(title='Loan amount distribution', figsize=(10,6))
% of instances per bin: 
 Loan_Amount_000_bin
0-15k       0.005768
15-30k      0.004699
30-45k      0.008734
45-60k      0.016361
60-75k      0.021453
75-90k      0.027160
90-105k     0.032468
105-120k    0.036326
120-135k    0.042062
135-150k    0.046709
150-165k    0.046907
165-180k    0.048634
180-195k    0.045571
195-210k    0.047084
210-225k    0.044677
225-240k    0.043001
240-255k    0.041436
255-270k    0.037519
270-285k    0.035386
285-300k    0.034377
300-315k    0.028371
315-330k    0.027430
330-345k    0.025142
345-360k    0.025046
360-375k    0.021788
375-390k    0.022448
390-405k    0.025735
405-420k    0.034014
420-435k    0.006277
435-450k    0.007967
450-465k    0.007627
465-480k    0.008243
480-495k    0.009088
495-510k    0.007244
510-525k    0.006326
525-540k    0.006169
540-555k    0.005393
555-570k    0.005341
570-585k    0.004746
+585k       0.005318
Name: Loan_Amount_000_bin, dtype: float64
Out[152]:
<matplotlib.axes._subplots.AxesSubplot at 0x125b5e860>
In [163]:
data.groupby(['State','County_Name'])['Loan_Amount_000'].describe()
Out[163]:
State  County_Name           
DC     DISTRICT OF COL  count    49451.000000
                        mean       414.596125
                        std        285.096230
                        min          4.000000
                        25%        248.000000
                        50%        364.000000
                        75%        520.000000
                        max       8000.000000
DE     KENT             count    11087.000000
                        mean       188.254623
                        std         78.626215
                        min          1.000000
                        25%        135.000000
                        50%        183.000000
                        75%        235.000000
                        max       1760.000000
       NEW CASTLE       count    38038.000000
                        mean       212.860797
                        std        120.023385
                        min          1.000000
                        25%        142.000000
                        50%        196.000000
                        75%        264.000000
                        max       4000.000000
       SUSSEX           count    12468.000000
                        mean       211.522538
                        std        120.788651
                        min          4.000000
                        25%        137.000000
                        50%        190.000000
                                     ...     
WV     WETZEL           std       4407.141259
                        min         11.000000
                        25%         60.000000
                        50%         91.000000
                        75%        129.750000
                        max      77637.000000
       WIRT             count      119.000000
                        mean       103.630252
                        std         56.208731
                        min         17.000000
                        25%         63.000000
                        50%         92.000000
                        75%        133.500000
                        max        320.000000
       WOOD             count     4424.000000
                        mean       187.750452
                        std       1971.687711
                        min          5.000000
                        25%         70.000000
                        50%        102.000000
                        75%        150.000000
                        max      92355.000000
       WYOMING          count      224.000000
                        mean       485.558036
                        std       6005.467877
                        min         13.000000
                        25%         50.000000
                        50%         73.500000
                        75%        102.750000
                        max      89963.000000
Name: Loan_Amount_000, dtype: float64

For upcoming years we can create validations to identify outliers or instances that might be errors. Those validations will be based on previous data.

Validations to apply:
    - Trends: are the values in the same trend as years before? The loan amount is moving accordingly.
    - Outliers: house prices move within ranges, verify that new values are within expectation. Otherwise, data may be erroneous or an outlier.
    - Comparing loan amount to income we get the a ratio of endebtness fo the borrower. Incorporing the Rate spread we may also stimate the installment and get the Debt to Income ratio DTI.

 All validations may be filtered by institudion too.

Respondent Name Review

In [3]:
inst_data,inst_names,inst_name = load_inst_data()
len(inst_data.Respondent_Name.unique())
Out[3]:
8269
In [10]:
# num de repodent is unicos
len(inst_data.Respondent_ID.unique())
Out[10]:
7914
In [9]:
maxl = 0
ina = ''
for i in inst_name:
    if len(inst_name[i]) > maxl:
        maxl = len(inst_name[i])
        ina = i
print(maxl,inst_name[ina])
5 ['AMERICAN BANK, NA', 'AMERICAN BANK N.A.', 'AMERICAN BANK', 'AMERICAN BANK NA', 'AMERICAN BANK, N.A.']
In [14]:
inst_data.loc[inst_data.Respondent_ID == '0000009788']
Out[14]:
As_of_Year Respondent_ID Agency_Code Respondent_Name_TS Respondent_City_TS Respondent_State_TS Respondent_ZIP_Code Parent_Name_TS Parent_City_TS Parent_State_TS Parent_ZIP_Code Assets_000_Panel Respondent_Name inst_id_yr
201 2012 0000009788 1 HERGET BANK, N.A. PEKIN IL 61555-0839 NaN NaN NaN NaN 267337 HERGET BANK 1-0000009788
2134 2012 0000009788 3 BANK OF BLOOMSDALE BLOOMSDALE MO 63627 NaN NaN NaN NaN 193336 BANK OF BLOOMSDALE 3-0000009788
4925 2012 0000009788 5 EGLIN FEDERAL CREDIT UNION FORT WALTON BEACH FL 32547 EGLIN FEDERAL CREDIT UNION FORT WALTON BEACH FL 32547 1408377 EGLIN FEDERAL CREDIT UNION 5-0000009788
7592 2013 0000009788 1 HERGET BANK, N.A. PEKIN IL 61555-0839 HERGET FINANCIAL CORP. PEKIN IL 61555-0839 259069 HERGET BANK 1-0000009788
9437 2013 0000009788 3 BANK OF BLOOMSDALE BLOOMSDALE MO 63627 NaN NaN NaN NaN 194346 BANK OF BLOOMSDALE 3-0000009788
12177 2013 0000009788 5 EGLIN FEDERAL CREDIT UNION FORT WALTON BEACH FL 32547 NaN NaN NaN NaN 1487189 EGLIN FEDERAL CREDIT UNION 5-0000009788
14774 2014 0000009788 1 HERGET BANK NATIONAL ASSOCIATI PEKIN IL 61554 NaN NaN NaN NaN 273992 HERGET BANK NATIONAL ASSOCIATI 1-0000009788
16590 2014 0000009788 3 BLOOMSDALE BANK BLOOMSDALE MO 63627 NaN NaN NaN NaN 190798 BLOOMSDALE BANK 3-0000009788
19246 2014 0000009788 5 EGLIN FEDERAL CREDIT UNION FORT WALTON BEACH FL 32547 NaN NaN NaN NaN 1557516 EGLIN FEDERAL CREDIT UNION 5-0000009788

The names are unique to the key Agency_Code-Respondent_ID

Applicant_Income_000 Review

This is an important risk attribute that shows payment capacity of a given borrower.

Validations:

- Data distribution. Is the income within the state and county ranges and expected trends at each level?
- Limit of detection. Set boundaries to determine possible errors or outliers in the data.
- Missing values. Is the number of missing values as in previous years, is it increasing/decreasing?
In [160]:
cnt
Out[160]:
State  As_of_Year
DC     2012          1.542059
       2013          1.454827
       2014          1.502308
DE     2012          0.805354
       2013          0.786691
       2014          0.757506
MD     2012          1.073355
       2013          1.025305
       2014          0.999843
VA     2012          1.020571
       2013          0.975942
       2014          0.948129
WV     2012          0.732593
       2013          0.717106
       2014          0.682683
Name: Applicant_Income_000_num, dtype: float64
In [139]:
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Income Deviation from Region Average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
Out[139]:
<matplotlib.legend.Legend at 0x12e29b1d0>

Lien_Status

This attribute provides information about potential risk. The shift in the composition of the applications and loans could represent an underlying recovery risk. When a loan has defaulted the proceedings are returned as according to the lien order.

Market Description

In [161]:
cnt = data.groupby(['Agency_Code_Description','Lien_Status_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
Out[161]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x13b5cca58>], dtype=object)
In [55]:
cnt = data.groupby(['Agency_Code_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
Out[55]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x10ea7d588>], dtype=object)
In [56]:
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Loan distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
Out[56]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x1157e66a0>], dtype=object)
In [106]:
cnt = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Amount distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
Out[106]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x126039898>], dtype=object)
In [141]:
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum()
cnt = cnt /data.groupby(['As_of_Year'])['Loan_Amount_000'].sum()
cnt.unstack().plot.pie(subplots=True,figsize=(20,5),
             title='Market Share by Year & State',fontsize=17,autopct ='%1.1f%%')
Out[141]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x12f492160>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x12fe02b70>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x12fe4f400>], dtype=object)
In [83]:
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt2 = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cn = (cnt2 - cnt)*100
print(cn)
State
DC    1.604022
DE   -1.320907
MD   -1.407897
VA    2.608773
WV   -1.483991
Name: Loan_Amount_000, dtype: float64
In [97]:
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()

ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
Out[97]:
<matplotlib.legend.Legend at 0x124c50a20>
In [99]:
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()

ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Loan Amount by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
Out[99]:
<matplotlib.legend.Legend at 0x124cb1dd8>
In [100]:
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()

ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
Out[100]:
<matplotlib.legend.Legend at 0x124e908d0>
In [90]:
cnt = data.groupby(['State'])['Loan_Amount_000',
            'Applicant_Income_000_num'].mean()

ax = cnt.plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income and Loan amount by State ',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
Out[90]:
<matplotlib.legend.Legend at 0x115647710>
In [145]:
cnt =data.groupby(['As_of_Year'])['Loan_Amount_000']
print(cnt.sum(),cnt.count())
As_of_Year
2012    159886593
2013    133380028
2014     90177176
Name: Loan_Amount_000, dtype: int64 As_of_Year
2012    557888
2013    472957
2014    290313
Name: Loan_Amount_000, dtype: int64
In [143]:
data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount by State and Year',
    kind='bar', 
    figsize=(10,5))
Out[143]:
<matplotlib.axes._subplots.AxesSubplot at 0x130b29048>
In [149]:
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount for Conventional & Conforming Loans by Year',
    kind='bar', 
    figsize=(10,5))
Out[149]:
<matplotlib.axes._subplots.AxesSubplot at 0x12ef53400>
In [311]:
data.groupby(['State','As_of_Year'])['FFIEC_Median_Family_Income_num'].mean().unstack().plot(
    title='FFIEC Average Income by State and Year',
    kind='bar', 
    figsize=(10,5))
Out[311]:
<matplotlib.axes._subplots.AxesSubplot at 0x1189b20b8>
In [167]:
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].count() 
cnt = cnt / loan_data.shape[0]
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by loan',fontsize=15)
ax.set_ylabel("% of number of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=12)
State  Conventional_Conforming_Flag
DC     N                               0.01
       Y                               0.03
DE     N                               0.01
       Y                               0.03
MD     N                               0.11
       Y                               0.23
VA     N                               0.18
       Y                               0.33
WV     N                               0.02
       Y                               0.04
Name: Loan_Amount_000, dtype: float64
Out[167]:
<matplotlib.text.Text at 0x13bda8710>
In [166]:
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].sum() 
cnt = cnt / loan_data.Loan_Amount_000.sum()
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by Amount',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=14)
ax.set_xlabel("State by CC Flag",fontsize=14)
State  Conventional_Conforming_Flag
DC     N                               0.02
       Y                               0.03
DE     N                               0.01
       Y                               0.02
MD     N                               0.13
       Y                               0.20
VA     N                               0.26
       Y                               0.29
WV     N                               0.02
       Y                               0.02
Name: Loan_Amount_000, dtype: float64
Out[166]:
<matplotlib.text.Text at 0x122343128>
In [38]:
# correlation plot using heatmap
cor = data.corr()
mask = np.zeros_like(cor, dtype=np.bool) # hiding upper triangle
mask[np.triu_indices_from(mask,k=1)] = True
f, ax = plt.subplots(figsize=(8, 8))
cmap = sns.diverging_palette(15, 150,center='light', as_cmap=True)
sns.heatmap(cor, mask=mask, cmap=cmap, vmax=.3,square=True, linewidths=1.15, cbar_kws={"shrink": .65}, ax=ax,)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1494dc278>
In [ ]:
scatter_matrix(loan_data[x_attr], alpha=0.2, figsize=(20, 10), diagonal='kde')
In [39]:
data.groupby(['State',
                   'Conventional_Conforming_Flag'])['Loan_Amount_000',
            'Applicant_Income_000_num',
            'FFIEC_Median_Family_Income_num'].mean().round(2).style.format("{:2}")
Out[39]:
Loan_Amount_000 Applicant_Income_000_num FFIEC_Median_Family_Income_num
State Conventional_Conforming_Flag
DC N 649.07 280.68 106130.97
DC Y 335.66 147.37 106077.82
DE N 231.7 87.33 72796.18
DE Y 197.09 94.98 73999.56
MD N 335.98 133.95 94115.66
MD Y 250.67 118.67 95078.94
VA N 411.03 121.03 84763.92
VA Y 247.68 114.85 87178.69
WV N 427.01 76.02 59399.34
WV Y 134.93 87.12 56462.9
In [60]:
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000',
            'Applicant_Income_000_num'].mean().unstack().plot(title='Income and Loan Amount Conventional and Conforming',kind='bar',figsize=(10,5))
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x118dc7780>

The average laon amount us increasing while the income of the target market has been declining over the past three years. This is potentially incresing stress on housholds payment capacity. In addition the number of

In [99]:
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].count().unstack().plot(kind='bar',
                                                                                                      figsize=(10,5))
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x127dae080>

Clustering

In [157]:
def plot_PCA_by4(DTtrans,y_pred,c1=1,c2=2,c3=3,c4=4):
    ''' Plot the principal componets in a 2x2 column figure where the first row corresponds to the first 
         principal component vs the second and third components, and the second row is the second component vs
         the third and fourth component by default. 
         
         Parameters
         ----------
         DTtrans: Numpy array with numeric values. 
         
         y_pred: Cluster assigned to each instance in DTtrans
         
         c1: Principal component to plot in y-axis/row 1. Default to 1.
         
         c2: Principal component to plot in y-axis/row 2 and first comparison of c1. Default to 2.
         
         c3: Principal component for comparison against c1 (row 1, column 2) and c2 (row 2 column 1). Default to 3.
         
         c4: Principal component for comparison against c2. Default to 4.
         '''
    
    plt.figure(figsize=(12, 12))
    plt.subplot(221)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c2], c= y_pred)
    plt.ylabel('PC1')
    plt.xlabel('PC2') 
    plt.title('KMeans Clustering')
    plt.subplot(222)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c3], c= y_pred)
    plt.xlabel('PC3') 
    plt.title('KMeans Clustering')
    plt.subplot(223)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c3], c= y_pred)
    plt.ylabel('PC2')
    plt.xlabel('PC3') 
    plt.subplot(224)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c4], c= y_pred)
    plt.xlabel('PC4')
    plt.show()
In [245]:
def Kmean_cluster(data,n_clusters=8, plot=True,n_components=None,normalize='zscore'):
    '''
    Performs nsupervized clustering using Kmeans and PCA for dimentionality reduction. 
    Returns the both Kmeans and PCA models, the data reduced by PCA and classifications
    
    Parameters:
    -----------
    data: Panda dataframe with the merge data
    
    n_clusters: number of clusters to create, deaulted to skleanr.Kmeans value of 8, 
    
    plot: Bolean, To create cluster plot. Default to True
    
    n_components: Number of component to be created by PCA. Default to the number of attibutes pass in data
    
    normalize: String with normalize function to apply 'zscore' or 'minmax'. Default == 'zscore'
    
    '''
    
    #execute minmax normalization
    minmax = lambda x: ((x-x.min())/(x.max()-x.min()) * (100-0) + 0) #if x.dtypes in [np.float64,no.int64]  else x
    zscore = lambda x: (x - x.mean()) / x.std()
    if normalize == 'zscore':
        data_norm = data[x_attr].apply(zscore) # applying function
    else:
        data_norm = data[x_attr].apply(minmax) # applying function
    # remove null data
    data_norm.dropna(inplace=True)

    colnames= list(data_norm.columns)
    # PCA  
    if n_components == None:
        n_components = len(colnames)
    pca = decomposition.PCA(n_components=n_components)
    # reducing dimensionality 
    DTtrans = pca.fit(data_norm).transform(data_norm)    
    clusters = KMeans(n_clusters=n_clusters, random_state=33).fit_predict(DTtrans)
    kmeans = KMeans()
    kmeans.fit_predict(DTtrans)

    # variance captured by component
    print ('Variance ratio provided by component: \n', pca.explained_variance_ratio_ , '\n')

    # we need to take the first 12 component to get 95% of the variance
    print ('Percentage Variance Captured with:')
    for i in range(1,len(colnames)+1):
        print ('\t %i components %0.1f' % (i,100*sum(pca.explained_variance_ratio_[:i])))


    plt.figure(1, figsize=(7, 5))
    plt.clf()
    plt.plot(pca.explained_variance_, linewidth=3)
    plt.xlabel('Number of components')
    plt.ylabel('Explained variance')
    
        
    print('\nTotal number of clusters created: %i' %(len(kmeans.cluster_centers_)))
    
    if plot:
        plot_PCA_by4(DTtrans,y_pred)
    
    return kmeans, DTtrans, pca, clusters
    
        
In [246]:
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='minmax')
Variance ratio provided by component: 
 [  2.29197680e-01   1.41614275e-01   1.26784421e-01   9.48884736e-02
   6.39929819e-02   5.61047528e-02   5.25916432e-02   4.42485735e-02
   3.90827050e-02   3.78888250e-02   3.09395469e-02   2.48643974e-02
   1.69151868e-02   1.08053066e-02   9.00253193e-03   7.91993740e-03
   4.53186992e-03   4.36631101e-03   1.96285720e-03   1.19444429e-03
   1.03625329e-03   6.64267680e-05   5.99382881e-07] 

Percentage Variance Captured with:
	 1 components 22.9
	 2 components 37.1
	 3 components 49.8
	 4 components 59.2
	 5 components 65.6
	 6 components 71.3
	 7 components 76.5
	 8 components 80.9
	 9 components 84.9
	 10 components 88.6
	 11 components 91.7
	 12 components 94.2
	 13 components 95.9
	 14 components 97.0
	 15 components 97.9
	 16 components 98.7
	 17 components 99.1
	 18 components 99.6
	 19 components 99.8
	 20 components 99.9
	 21 components 100.0
	 22 components 100.0
	 23 components 100.0

Total number of clusters created: 8
In [249]:
# minmax normalization 
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)

for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
PC 1:
	-0.44 	 Loan_Purpose_Description_cat
	0.37 	 Loan_Type_Description_cat
	0.53 	 Conventional_Status_cat
	-0.56 	 Conventional_Conforming_Flag_cat
PC 2:
	-0.39 	 Census_Tract_Number_num
	-0.46 	 FFIEC_Median_Family_Income_num
	-0.71 	 Conforming_Limit_000
PC 3:
	-0.71 	 MSA_MD_num
	-0.51 	 State_Code
PC 4:
	-0.48 	 As_of_Year
	0.70 	 Loan_Purpose_Description_cat
PC 5:
	0.42 	 Agency_Code
	0.77 	 As_of_Year
PC 6:
	-0.57 	 Agency_Code
	0.38 	 As_of_Year
	0.38 	 Loan_Purpose_Description_cat
PC 7:
	-0.38 	 Census_Tract_Number_num
	0.36 	 MSA_MD_num
	-0.43 	 Agency_Code
	-0.46 	 State_Code
	-0.36 	 State_cat
	-0.36 	 Respondent_Name_id
PC 8:
	0.44 	 Agency_Code
	-0.82 	 Respondent_Name_id
PC 9:
	0.87 	 Respondent_ID_num
	-0.37 	 County_Name_cat
PC 10:
	-0.67 	 County_Name_cat
	0.39 	 Conforming_Status_cat
PC 11:
	0.40 	 County_Name_cat
	0.66 	 Conforming_Status_cat
PC 12:
	-0.70 	 County_Code_num
	0.36 	 MSA_MD_num
PC 13:
	0.57 	 Census_Tract_Number_num
	0.50 	 County_Code_num
	-0.39 	 County_Name_cat
PC 14:
	0.86 	 Number_of_Owner_Occupied_Units_num
PC 15:
	1.00 	 Lien_Status_Description_cat
PC 16:
	0.80 	 Loan_Type_Description_cat
PC 17:
	0.79 	 Tract_to_MSA_MD_Income_Pct_num
	-0.49 	 Sequence_Number
PC 18:
	0.48 	 Tract_to_MSA_MD_Income_Pct_num
	0.84 	 Sequence_Number
PC 19:
	0.82 	 FFIEC_Median_Family_Income_num
	-0.48 	 Conforming_Limit_000
PC 20:
	0.63 	 Conventional_Status_cat
	0.50 	 Conforming_Status_cat
	0.58 	 Conventional_Conforming_Flag_cat
PC 21:
	0.54 	 State_Code
	-0.79 	 State_cat
PC 22:
	1.00 	 Applicant_Income_000_num
PC 23:
	1.00 	 Loan_Amount_000
In [255]:
centroids.describe()
Out[255]:
Applicant_Income_000_num Census_Tract_Number_num County_Code_num FFIEC_Median_Family_Income_num MSA_MD_num Number_of_Owner_Occupied_Units_num Respondent_ID_num Tract_to_MSA_MD_Income_Pct_num Agency_Code Loan_Amount_000 ... Conforming_Limit_000 Loan_Purpose_Description_cat Lien_Status_Description_cat Loan_Type_Description_cat State_cat County_Name_cat Conventional_Status_cat Conforming_Status_cat Conventional_Conforming_Flag_cat Respondent_Name_id
count 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 ... 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000e+00
mean 0.160737 -0.040453 0.001255 -0.030728 0.031083 -0.048491 0.056338 0.016423 -0.003016 0.058433 ... -0.001506 0.000889 0.002614 -0.000820 -0.001232 -0.000222 -0.005759 0.002119 0.000021 4.005864e-07
std 0.730092 0.437840 0.365334 0.322724 0.143000 0.185789 0.265183 0.108464 0.046422 0.241734 ... 0.016453 0.003945 0.021132 0.007422 0.004613 0.009935 0.024593 0.011612 0.000132 6.659804e-05
min -0.481971 -0.465601 -0.440805 -0.751235 -0.173369 -0.471619 -0.165860 -0.117522 -0.100628 -0.069875 ... -0.022395 -0.005270 -0.030478 -0.016343 -0.011938 -0.020910 -0.064537 -0.006544 -0.000133 -9.818891e-05
25% -0.392693 -0.352628 -0.294260 -0.061338 -0.040808 -0.068908 -0.063417 -0.021982 -0.014310 -0.049660 ... -0.014989 -0.001877 -0.011365 -0.001433 -0.001212 -0.002998 -0.001758 -0.002534 -0.000084 -3.711344e-05
50% -0.119933 -0.159367 -0.002480 0.025378 0.001003 0.012535 -0.029423 -0.009517 0.010222 -0.019631 ... -0.002052 0.001486 0.006476 0.001206 -0.000055 0.000803 0.001164 -0.000823 -0.000008 5.604213e-06
75% 0.544461 0.148931 0.189173 0.126086 0.080148 0.054491 0.064524 0.026842 0.024108 0.010983 ... 0.011420 0.003172 0.020683 0.002828 0.000809 0.005190 0.003648 0.000736 0.000104 5.414455e-05
max 1.336218 0.718242 0.655399 0.298091 0.284404 0.103165 0.676169 0.257533 0.043260 0.651985 ... 0.023376 0.006332 0.025909 0.007247 0.002702 0.011857 0.015897 0.030091 0.000224 7.892220e-05

8 rows × 23 columns

In [248]:
datos_mm =[]
for col in data.columns:
    if data[col].dtype in [int,float]:
        temp =[]
        temp.append(col)
        temp.append(data[col].mean())
        temp.append(data[col].std())
        datos_mm.append(temp)
datos_mm = pan.DataFrame(datos_mm,columns=['attr','min','max'])
datos_mm.to_csv('Centroids minmax vals.csv',index=False)
In [219]:
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='zscore')
Variance ratio provided by component: 
 [ 0.13615468  0.11167448  0.09384719  0.08306762  0.08202813  0.06082295
  0.05669165  0.04817923  0.04580173  0.04414429  0.04251639  0.03723629
  0.03116526  0.03006547  0.02817814  0.02333021  0.02261443  0.01254846
  0.00546148  0.00200448  0.00124173  0.00066991  0.00055579] 

Percentage Variance Captured with:
	 1 components 13.6
	 2 components 24.8
	 3 components 34.2
	 4 components 42.5
	 5 components 50.7
	 6 components 56.8
	 7 components 62.4
	 8 components 67.2
	 9 components 71.8
	 10 components 76.2
	 11 components 80.5
	 12 components 84.2
	 13 components 87.3
	 14 components 90.3
	 15 components 93.2
	 16 components 95.5
	 17 components 97.8
	 18 components 99.0
	 19 components 99.6
	 20 components 99.8
	 21 components 99.9
	 22 components 99.9
	 23 components 100.0

Total number of clusters created: 8
In [239]:
# zscore normalization 
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)

for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
PC 1:
	-0.35 	 County_Code_num
	-0.37 	 State_Code
	-0.37 	 State_cat
PC 2:
	0.37 	 Loan_Type_Description_cat
	0.40 	 Conventional_Status_cat
	-0.47 	 Conventional_Conforming_Flag_cat
PC 3:
	0.44 	 MSA_MD_num
	0.39 	 State_Code
	0.41 	 Conforming_Limit_000
PC 4:
	0.43 	 Applicant_Income_000_num
	0.45 	 Tract_to_MSA_MD_Income_Pct_num
	0.46 	 Conforming_Status_cat
PC 5:
	-0.46 	 Agency_Code
	-0.55 	 Sequence_Number
PC 6:
	-0.35 	 Census_Tract_Number_num
	-0.42 	 Number_of_Owner_Occupied_Units_num
	0.49 	 Lien_Status_Description_cat
	-0.36 	 State_cat
PC 7:
	0.78 	 Lien_Status_Description_cat
PC 8:
	-0.56 	 Respondent_ID_num
	-0.60 	 As_of_Year
	-0.37 	 Respondent_Name_id
PC 9:
	-0.36 	 Respondent_ID_num
	-0.60 	 Agency_Code
	0.53 	 Respondent_Name_id
PC 10:
	-0.40 	 Number_of_Owner_Occupied_Units_num
	-0.53 	 County_Name_cat
PC 11:
	-0.38 	 Number_of_Owner_Occupied_Units_num
	0.53 	 Respondent_ID_num
	-0.50 	 As_of_Year
PC 12:
	0.41 	 Census_Tract_Number_num
	-0.46 	 MSA_MD_num
	0.55 	 County_Name_cat
PC 13:
	0.63 	 Applicant_Income_000_num
	-0.50 	 Conforming_Status_cat
PC 14:
	-0.71 	 Loan_Purpose_Description_cat
	-0.39 	 Conforming_Status_cat
PC 15:
	0.35 	 Number_of_Owner_Occupied_Units_num
	-0.62 	 Tract_to_MSA_MD_Income_Pct_num
PC 16:
	-0.76 	 County_Code_num
	-0.41 	 Number_of_Owner_Occupied_Units_num
PC 17:
	-0.43 	 Agency_Code
	0.66 	 Sequence_Number
	-0.44 	 Respondent_Name_id
PC 18:
	0.64 	 Census_Tract_Number_num
	0.51 	 MSA_MD_num
	-0.37 	 Conforming_Limit_000
PC 19:
	0.79 	 Loan_Type_Description_cat
	-0.37 	 Conventional_Status_cat
	0.45 	 Conventional_Conforming_Flag_cat
PC 20:
	0.59 	 FFIEC_Median_Family_Income_num
	-0.42 	 State_Code
	-0.46 	 Conforming_Limit_000
	0.48 	 State_cat
PC 21:
	0.43 	 FFIEC_Median_Family_Income_num
	0.58 	 State_Code
	-0.48 	 Conforming_Limit_000
	-0.45 	 State_cat
PC 22:
	0.50 	 Loan_Amount_000
	-0.60 	 Conventional_Status_cat
	-0.56 	 Conventional_Conforming_Flag_cat
PC 23:
	0.85 	 Loan_Amount_000
	0.38 	 Conventional_Status_cat
In [241]:
datos_mm =[]
for col in data.columns:
    if data[col].dtype in [int,float]:
        temp =[]
        temp.append(col)
        temp.append(data[col].mean())
        temp.append(data[col].std())
        datos_mm.append(temp)
datos_mm = pan.DataFrame(datos_mm,columns=['attr','mean','std'])
In [207]:
data.loc[data.Respondent_Name =='WELLS FARGO BANK']
Out[207]:
Agency_Code Applicant_Income_000 As_of_Year Census_Tract_Number County_Code FFIEC_Median_Family_Income Loan_Amount_000 MSA_MD Number_of_Owner_Occupied_Units Respondent_ID ... Respondent_City_TS Respondent_State_TS Respondent_ZIP_Code Parent_Name_TS Parent_City_TS Parent_State_TS Parent_ZIP_Code Assets_000_Panel Respondent_Name Respondent_Name_id
20724 9 0221 2012 0204.00 079 00054900 409 16620 00002300 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20725 9 0125 2012 0206.04 079 00054900 260 16620 00000715 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20726 9 0032 2012 0103.00 039 00054900 47 16620 00000736 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20727 9 0185 2012 0204.00 079 00054900 237 16620 00002300 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20728 9 0086 2012 0011.00 039 00054900 204 16620 00001582 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20729 9 0127 2012 0202.00 079 00054900 219 16620 00001738 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20730 9 0117 2012 0019.01 039 00054900 200 16620 00001396 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20731 9 0120 2012 0206.03 079 00054900 282 16620 00002198 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20732 9 0133 2012 0123.00 039 00054900 333 16620 00001805 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20733 9 0090 2012 0105.00 039 00054900 176 16620 00001483 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20734 9 0084 2012 0206.05 079 00054900 229 16620 00002493 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20735 9 0069 2012 0019.02 039 00054900 240 16620 00001552 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20736 9 0180 2012 9585.02 005 00054900 273 16620 00001177 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20737 9 0047 2012 0133.00 039 00054900 180 16620 00000915 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20738 9 0095 2012 0105.00 039 00054900 160 16620 00001483 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20739 9 0121 2012 0132.00 039 00054900 409 16620 00001138 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20740 9 0125 2012 0206.05 079 00054900 223 16620 00002493 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20741 9 0085 2012 0113.02 039 00054900 125 16620 00001686 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20742 9 0296 2012 0206.01 079 00054900 155 16620 00001935 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20743 9 0101 2012 0203.00 079 00054900 154 16620 00001381 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20744 9 0101 2012 0206.05 079 00054900 311 16620 00002493 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20745 9 0110 2012 0105.00 039 00054900 142 16620 00001483 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20746 9 0200 2012 0204.00 079 00054900 272 16620 00002300 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20747 9 0040 2012 0011.00 039 00054900 173 16620 00001582 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20748 9 0102 2012 0106.00 039 00054900 63 16620 00001511 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20749 9 0380 2012 0018.00 039 00054900 360 16620 00000782 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20750 9 0150 2012 0123.00 039 00054900 200 16620 00001805 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20751 9 0148 2012 0204.00 079 00054900 178 16620 00002300 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20752 9 0065 2012 0207.00 079 00054900 166 16620 00001545 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
20753 9 0529 2012 0207.00 079 00054900 107 16620 00001545 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1161490000 WELLS FARGO BANK 1308
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25838 9 0130 2014 0101.00 027 00052000 235 NA 00001503 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25839 9 0046 2014 0501.00 197 00052000 64 NA 00001528 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25840 9 0233 2014 0403.00 099 00052000 258 NA 00002274 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25841 9 0119 2014 0203.00 133 00052000 260 NA 00002135 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25842 9 0084 2014 0405.00 099 00052000 323 NA 00000792 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25843 9 0015 2014 0203.00 133 00052000 58 NA 00002135 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25844 9 0037 2014 9301.00 163 00052000 125 NA 00002079 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25845 9 0091 2014 9302.00 163 00052000 172 NA 00001559 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25846 9 0033 2014 9312.00 195 00052000 84 NA 00001587 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25847 9 0020 2014 0701.01 640 00052000 31 NA 00001415 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25848 9 0060 2014 0001.00 590 00052000 69 NA 00001841 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25849 9 NA 2014 9502.01 109 00052000 55 NA 00001238 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25850 9 0080 2014 9505.00 109 00052000 152 NA 00002168 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25851 9 0039 2014 0403.00 171 00052000 119 NA 00001271 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25852 9 0062 2014 9312.00 195 00052000 149 NA 00001587 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25853 9 0106 2014 1101.02 137 00052000 43 NA 00002696 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25854 9 0052 2014 1101.03 137 00052000 140 NA 00003393 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25855 9 0044 2014 2001.00 175 00052000 120 NA 00001366 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25856 9 0051 2014 0803.00 035 00052000 170 NA 00001739 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25857 9 0051 2014 1103.00 137 00052000 176 NA 00001958 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25858 9 0092 2014 8801.01 081 00052000 134 NA 00001282 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25859 9 0107 2014 0406.00 171 00052000 170 NA 00000965 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25860 9 0058 2014 9511.00 119 00052000 154 NA 00000864 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25861 9 0039 2014 0407.00 171 00052000 200 NA 00001076 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25862 9 0044 2014 0401.00 171 00052000 133 NA 00002112 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25863 9 0113 2014 2004.00 175 00052000 243 NA 00001686 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25864 9 0057 2014 0014.00 590 00052000 124 NA 00001073 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25865 9 0078 2014 0106.02 089 00052000 138 NA 00001199 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25866 9 0030 2014 0110.01 143 00052000 61 NA 00001347 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308
25867 9 0292 2014 0401.00 099 00052000 91 NA 00001020 0000451965 ... WINSTON-SALEM NC 27150 WELLS FARGO & CO SAN FRANCISCO CA 94104-1207 1373600000 WELLS FARGO BANK 1308

137060 rows × 53 columns

In [288]:
a = data.MSA_MD_num.unique()
a.sort()
a
Out[288]:
array([ 12580.,  13220.,  13644.,  13980.,  15680.,  16620.,  16820.,
        19060.,  19260.,  20100.,  25180.,  25500.,  26580.,  28700.,
        31340.,  34060.,  37620.,  40060.,  40220.,  41540.,  43524.,
        44420.,  44600.,  47260.,  47894.,  48260.,  48540.,  48864.,
        49020.,     nan])
In [298]:
for i in a:
    if np.isnan(i) == False:
        print(data.loc[data.MSA_MD_num == i].State.unique())
['MD']
['WV']
['MD']
['VA']
['MD']
['WV']
['VA']
['WV' 'MD']
['VA']
['DE']
['WV' 'MD']
['VA']
['WV']
['VA']
['VA']
['WV']
['WV']
['VA']
['VA']
['MD' 'DE']
['MD']
['VA']
['WV']
['VA']
['DC' 'MD' 'VA' 'WV']
['WV']
['WV']
['DE' 'MD']
['VA' 'WV']