import itertools, collections # ad increments to a index of loop
import pandas as pan
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
from sklearn.tree import DecisionTreeRegressor # bining
from sklearn import preprocessing # Categorical attr to numeric
from sklearn.cluster import KMeans
from sklearn import decomposition
#plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
%matplotlib inline
def load_inst_data():
    """Load the 2012_to_2014_institutions_data.csv file.

    Adds two derived columns:
      - 'inst_id_yr': unique id built as '<Agency_Code>-<Respondent_ID>'
      - 'Respondent_Name': a unified institution name per id (the shortest
        known name, cleaned of punctuation and legal-suffix stop words)

    Returns
    -------
    inst_data : pandas.DataFrame
        The institutions data with the derived columns added.
    inst_name : dict
        Maps each 'inst_id_yr' id to its unified institution name.
    """
    inst_data = pan.read_csv('2012_to_2014_institutions_data.csv',
                             low_memory=False, na_values=['', ' ', ])
    # unique id for agency + respondent
    inst_data['inst_id_yr'] = inst_data.apply(
        lambda row: str(row['Agency_Code']) + '-' + str(row['Respondent_ID']), axis=1)
    # collect every name reported for each unique id
    inst_idn = inst_data[['inst_id_yr', 'Respondent_Name_TS']]
    inst_id_names = {}
    for i in inst_data.inst_id_yr.unique():
        names = inst_idn.loc[inst_idn.inst_id_yr == i]
        inst_id_names[i] = list(names.Respondent_Name_TS.unique())
    # tokens to drop from the institution name (legal suffixes etc.)
    stop_words = {'INC', 'CO', 'CIA', 'LLC', 'CORP', 'NA', 'FCU', 'CFCU'}
    inst_name = {}  # key: inst_id_yr, value: unified name
    for inst, known_names in inst_id_names.items():
        # the shortest known name is taken as the canonical one
        uname = min(known_names, key=len)
        # strip punctuation
        for ch in ('.', ',', "'"):
            uname = uname.replace(ch, '')
        # tokenize, upper-case and drop stop words.
        # (fixed: the original popped by pre-computed indices and only
        #  decremented the *next* index after each pop, so with more than
        #  two stop words the wrong tokens were removed)
        tokens = [t for t in (x.strip().upper() for x in uname.split(' '))
                  if t not in stop_words]
        inst_name[inst] = " ".join(tokens)
    inst_data['Respondent_Name'] = inst_data.apply(
        lambda row: inst_name[row['inst_id_yr']], axis=1)
    return inst_data, inst_name
# discretizing data function
# since there are outliers, the bins are built from the spread within 1.5 IQR of the quartiles
def Create_bins(data, variable='Loan_Amount_000', bin_name=None, quantiy_label=False,
                num_bins=None, regression=False, attributes=None, max_depth=None,
                min_samples_leaf=10000):
    """Discretize a numeric column of *data* into labelled bins.

    Two approaches:
    1) A fixed number of equal-width bins whose width is computed from the
       IQR outlier fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR], so outliers do not
       inflate the bin width.
    2) Regression binning: the split thresholds of a DecisionTreeRegressor
       fitted on *attributes* become the bin edges.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing *variable*; the bin column is added to it in place.
    variable : str
        Name of the column to discretize.
    bin_name : str, optional
        Name of the new bin column. Defaults to variable + '_bin'.
    quantiy_label : str, optional
        Suffix appended to each bin label ('%', 'k', '$', ...). Falsy values
        (including the default False) add no suffix.
    num_bins : int, optional
        Number of bins for approach 1 (default 40). Ignored when
        regression=True.
    regression : bool
        Use a decision tree to derive the bin edges. Default False.
    attributes : list of str
        Columns (including *variable*) fed to the regressor tree.
        Required when regression=True.
    max_depth : int, optional
        Tree depth; defaults to int(round(len(data) ** .5)).
    min_samples_leaf : int
        Minimum number of samples per tree leaf.

    Returns
    -------
    pandas.DataFrame
        *data* with the bin column added.
    """
    # ---- input validation -------------------------------------------------
    if not isinstance(variable, str):
        raise TypeError('variable must be a string')
    if regression:
        if attributes is None or not hasattr(attributes, '__len__'):
            raise TypeError('attributes must be provided as a list containing '
                            'the names of the columns to be used by the decision tree')
        if max_depth is None:
            max_depth = int(round(len(data) ** .5))
        else:
            max_depth = int(max_depth)
        min_samples_leaf = int(min_samples_leaf)
    elif num_bins is not None:
        num_bins = int(num_bins)
    # suffix appended to every label; falsy -> no suffix
    # (fixed: the original tested `!= None`, so the default False produced
    #  labels ending in the literal string 'False')
    suffix = str(quantiy_label) if quantiy_label else ''
    # ---- bin creation -----------------------------------------------------
    if regression:
        # fit a regression tree and use its split thresholds as bin edges.
        # (fixed: the original overwrote *variable* with 'Loan_Amount_000'
        #  and read the undefined global 'loan_data' instead of *data*)
        x = data[attributes].copy()
        x.dropna(inplace=True)
        y = x.pop(variable)
        DTC = DecisionTreeRegressor(max_depth=max_depth,
                                    min_samples_leaf=min_samples_leaf)
        DTC.fit(x, y)
        # thresholds of real split nodes only (leaf nodes are flagged -2)
        thrs_out = np.unique(DTC.tree_.threshold[DTC.tree_.feature > -2])
        bins = np.sort(thrs_out)
        # one label per interval between consecutive edges
        labels = ['{0}-{1}{2}'.format(round(bins[i - 1], 2), round(bins[i], 2), suffix)
                  for i in range(1, len(bins))]
    else:
        num_bins = num_bins if num_bins is not None else 40
        # IQR fences keep outliers from inflating the bin width
        q1 = data[variable].quantile(.25)
        q3 = data[variable].quantile(.75)
        iqr = q3 - q1
        low = max(0, q1 - iqr * 1.5)
        hig = q3 + iqr * 1.5
        cut = (hig - low) / num_bins
        # bin edges always start at 0 (as in the original implementation)
        bins = [0]
        for i in range(num_bins):
            bins.append(int(bins[i] + cut))
        labels = []
        for i in range(1, num_bins + 1):
            if i == num_bins:
                # open-ended label for the top bin
                labels.append('+' + str(bins[i - 1]) + suffix)
            else:
                labels.append(str(bins[i - 1]) + '-' + str(bins[i]) + suffix)
    # ---- assign each instance to its bin ---------------------------------
    if bin_name is None:
        bin_name = variable + '_bin'
    data[bin_name] = pan.cut(data[variable], bins, labels=labels)
    return data
def load_loan_data(variable_to_bin='Loan_Amount_000', bin_name=None, quantiy_label=False,
                   num_bins=None, regression=False, attributes=None, max_depth=None,
                   min_samples_leaf=10000):
    """Load 2012_to_2014_loans_data.csv, converting '' / ' ' values to NaN.

    Preprocessing performed:
      - numeric copies ('<col>_num') of object columns that hold numbers
      - label-encoded copies ('<col>_cat') of the categorical columns
      - 'inst_id_yr' id built as '<Agency_Code>-<Respondent_ID>'
      - a bin column for *variable_to_bin* created with Create_bins()
        (all remaining parameters are forwarded to Create_bins)

    Returns
    -------
    loan_data : pandas.DataFrame
    x_attr : list of str
        Numeric attribute names, including the encoded categoricals.
    cat_attr : list of str
        Original categorical attribute names.
    """
    loan_data = pan.read_csv('2012_to_2014_loans_data.csv',
                             na_values=['', ' ', ], low_memory=False)
    # object columns that actually hold numeric values
    col_to_numeric = ['Applicant_Income_000',
                      'Census_Tract_Number',
                      'County_Code',
                      'FFIEC_Median_Family_Income',
                      'MSA_MD',
                      'Number_of_Owner_Occupied_Units',
                      'Respondent_ID',
                      'Tract_to_MSA_MD_Income_Pct']
    for var in col_to_numeric:
        loan_data[var + '_num'] = pan.to_numeric(loan_data[var], errors='coerce')
    # numeric attributes
    num_attr = [x for x in loan_data.columns if '_num' in x]
    num_attr.extend(['Agency_Code', 'Loan_Amount_000', 'As_of_Year',
                     'Sequence_Number', 'State_Code', 'Conforming_Limit_000'])
    # categorical attributes
    cat_attr = ['Loan_Purpose_Description',
                'Lien_Status_Description',
                'Loan_Type_Description',
                'State', 'County_Name',
                'Conventional_Status',
                'Conforming_Status',
                'Conventional_Conforming_Flag']
    # label-encode every categorical attribute into '<name>_cat'
    cat_attr_num = []
    for attr in cat_attr:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(loan_data[attr].values))
        loan_data[attr + '_cat'] = lbl.transform(list(loan_data[attr].values))
        cat_attr_num.append(attr + '_cat')
    x_attr = num_attr.copy()
    x_attr.extend(cat_attr_num)
    # unique id for agency + respondent
    loan_data['inst_id_yr'] = loan_data.apply(
        lambda row: str(row['Agency_Code']) + '-' + str(row['Respondent_ID']), axis=1)
    # bin the requested variable (the original call re-derived every default
    # through redundant conditionals; arguments are now passed straight through)
    loan_data = Create_bins(loan_data,
                            variable=variable_to_bin,
                            bin_name=bin_name,
                            quantiy_label='k' if quantiy_label is False else quantiy_label,
                            num_bins=40 if num_bins is None else num_bins,
                            regression=regression,
                            attributes=x_attr if attributes is None else attributes,
                            max_depth=max_depth,
                            min_samples_leaf=10000 if min_samples_leaf is None else min_samples_leaf)
    return loan_data, x_attr, cat_attr
def hmda_init(cats=False, variable_to_bin='Loan_Amount_000', bin_name=None, quantiy_label=False,
              num_bins=None, regression=False, attributes=None, max_depth=None,
              min_samples_leaf=10000):
    '''Merge institutions and loan data using load_loan_data() and load_inst_data().

    Both source files must be located in the directory where the Python
    instance is running and must be named:
        Institutions file name : 2012_to_2014_institutions_data.csv
        Loans file name        : 2012_to_2014_loans_data.csv

    Returns
    -------
    data : pandas.DataFrame
        Loans and institutions data merged (always returned).
    x_attr, cat_attr : list of str
        Numerical and categorical attribute names; returned only when
        cats=True.

    Parameters
    ----------
    cats : bool
        When True the lists x_attr and cat_attr are also returned. Default False.
    variable_to_bin, bin_name, quantiy_label, num_bins, regression,
    attributes, max_depth, min_samples_leaf :
        Forwarded to load_loan_data() / Create_bins(); see Create_bins for
        their meaning.
    '''
    # institutions data
    inst_data, inst_name = load_inst_data()
    # these columns come from the loans side of the merge
    inst_data.drop('Agency_Code', axis=1, inplace=True)
    inst_data.drop('Respondent_ID', axis=1, inplace=True)
    # loan data -- forward the binning arguments
    # (fixed: the original called load_loan_data() with no arguments, so
    #  every parameter accepted by this function was silently ignored)
    loan_data, x_attr, cat_attr = load_loan_data(
        variable_to_bin=variable_to_bin, bin_name=bin_name,
        quantiy_label=quantiy_label, num_bins=num_bins, regression=regression,
        attributes=attributes, max_depth=max_depth,
        min_samples_leaf=min_samples_leaf)
    # merged result
    data = pan.DataFrame()
    # merge year by year so each loan gets that year's institution record
    for year in loan_data.As_of_Year.unique():
        d = loan_data.query('As_of_Year == @year').copy()
        i = inst_data.query('As_of_Year == @year').copy()
        temp = pan.merge(left=d, right=i, how='inner', on='inst_id_yr')
        data = pan.concat([data, temp])
    # both sides carried As_of_Year; keep a single column
    data.rename(columns={'As_of_Year_x': 'As_of_Year'}, inplace=True)
    data.drop('As_of_Year_y', axis=1, inplace=True)
    # numeric id for the unified respondent name
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(data['Respondent_Name'].values))
    data['Respondent_Name_id'] = lbl.transform(list(data['Respondent_Name'].values))
    x_attr.append('Respondent_Name_id')
    if cats:
        return data, x_attr, cat_attr
    return data
''' TODO: decide whether states, agency and respondent should be passed as strings or as ints '''
def hmda_to_json(data, states='all', conventional_conforming='all', agency='all', respondent='all'):
    """Export *data* (optionally filtered) to 'data.txt' as a list of JSON records.

    Parameters
    ----------
    data : pandas.DataFrame
        Resulting data object from the hmda_init function.
    states : str, int or list, optional
        State (or list of states) to keep. Default 'all' keeps every state.
    conventional_conforming : str or list, optional
        'all' (default) to keep every loan, or the
        Conventional_Conforming_Flag value(s) to keep.
    agency : str, int or list, optional
        Agency description(s) used to filter the export.
    respondent : str, int or list, optional
        Respondent id(s) used to filter the export.

    Example
    -------
    hmda_to_json(data, states='DC', conventional_conforming='all',
                 agency=['HUD'], respondent=['0000000384'])
    """
    # ---- input validation: normalise every filter to 'all' or a list -----
    if not isinstance(data, pan.DataFrame):
        raise TypeError('data must be a Pandas DataFrame')

    def _as_list(value, arg_name):
        # 'all' disables the filter; scalars are wrapped so .isin() works.
        # (fixed: the original rejected int scalars before the unreachable
        #  wrapping code, and passing a plain string made .isin() raise)
        if isinstance(value, str):
            return value if value == 'all' else [value]
        if isinstance(value, int):
            return [value]
        if isinstance(value, list):
            return value
        raise TypeError(arg_name + ' must be a scalar value or a list of values')

    states = _as_list(states, 'states')
    conventional_conforming = _as_list(conventional_conforming, 'conventional_conforming')
    agency = _as_list(agency, 'agency')
    respondent = _as_list(respondent, 'respondent')
    # ---- apply the filters ------------------------------------------------
    if states != 'all':
        data = data[data.State.isin(states)]
    if conventional_conforming != 'all':
        data = data[data.Conventional_Conforming_Flag.isin(conventional_conforming)]
    if agency != 'all':
        data = data[data.Agency_Code_Description.isin(agency)]
    if respondent != 'all':
        data = data[data.Respondent_ID.isin(respondent)]
    # ---- export -----------------------------------------------------------
    with open('data.txt', 'w') as file:
        file.write(data.to_json(orient='records'))
    print('Done!')
# build the merged dataset and keep the attribute name lists
data,x_attr,cat_attr = hmda_init(True)
# notebook echo: available columns
data.columns
# summary statistics of the numeric attributes
data[x_attr].describe().round(2)
# summary of the categorical attributes
data[cat_attr].describe(include='all')
# distribution of the loan amounts (in thousands)
data.Loan_Amount_000.describe().round(2)
# loans in the District of Columbia
dd = data.loc[(data.County_Name == "DISTRICT OF COL") & (data.State == 'DC' )]
dd.shape
# distribution of DC loan amounts
# NOTE(review): sns.distplot is deprecated in recent seaborn -- confirm the version in use
sns.distplot(dd.Loan_Amount_000)
# mean loan amount per State/year relative to the overall mean applicant income
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Borrower's indebtedness by State",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
# mean loan amount per State/year relative to the overall mean loan amount
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()/data.Loan_Amount_000.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Loan amount Deviation form average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
# log-scale histogram of loan amounts
data.Loan_Amount_000.plot.hist(logy=True,figsize=(10,6))
The logarithmic distribution of the loan amount suggests it follows a power law.
# loan amount vs the numeric state code
data.plot.scatter(x='State_Code', y='Loan_Amount_000',figsize=(10,6))
# loan amount by county code
data.plot.scatter(title='Loan amount by county',x='County_Code_num', y='Loan_Amount_000',figsize=(10,6))
There are some outliers in the data that need to be addressed. Since we have a vast amount of data we could replace the value with the mean of the similar values in terms of region, institution, and income of the borrower. On the other hand, I need to see if the data points are within Change Financial's region of interest.
# loan amount split by the encoded Conventional/Conforming flag
data.plot.scatter(x='Conventional_Conforming_Flag_cat', y='Loan_Amount_000',figsize=(10,6))
No is zero, Yes is 1
# loan amount vs CC flag cross-tabulation
outcome = pan.crosstab(data['Loan_Amount_000'], data['Conventional_Conforming_Flag'])
plt.show(outcome.plot(kind="bar", title = 'Loan Amount by C&C',figsize=(20,10)))
# income vs loan amount, colored by the encoded CC flag (0=blue, 1=green)
fig, ax = plt.subplots(figsize=(10, 6), ncols=1)
color_pallet= {0:'blue', 1:'green'}
ax.scatter(data['Applicant_Income_000_num'],
           data['Loan_Amount_000'],
           c=data['Conventional_Conforming_Flag_cat'].apply(lambda x: color_pallet[x]))
plt.show()
# number of instances per loan-amount bin
a = data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count()
#print('Numer of instances per bin: \n',a,'\n\n', '% of instances per bin: \n',
#   a/loan_data.Loan_Amount_000.count(),end='\n')
print('% of instances per bin: \n',a/data.Loan_Amount_000.count())
# distribution of loans across the bins
data.groupby('Loan_Amount_000_bin')['Loan_Amount_000_bin'].count().plot.bar(title='Loan amount distribution', figsize=(10,6))
# loan amount summary per State/County
data.groupby(['State','County_Name'])['Loan_Amount_000'].describe()
For upcoming years we can create validations to identify outliers or instances that might be errors. Those validations will be based on previous data.
Validations to apply:
- Trends: are the values in the same trend as years before? The loan amount is moving accordingly.
- Outliers: house prices move within ranges, verify that new values are within expectation. Otherwise, data may be erroneous or an outlier.
- Comparing loan amount to income we get a ratio of indebtedness for the borrower. Incorporating the rate spread we may also estimate the installment and get the debt-to-income ratio (DTI).
All validations may be filtered by institution too.
# fixed: load_inst_data() returns two values; the original three-name
# unpacking (inst_data, inst_names, inst_name) raised a ValueError
inst_data,inst_name = load_inst_data()
# number of unique unified institution names
len(inst_data.Respondent_Name.unique())
# number of unique respondent ids
len(inst_data.Respondent_ID.unique())
# find the longest unified institution name
maxl = 0
ina = ''
for i in inst_name:
    if len(inst_name[i]) > maxl:
        maxl = len(inst_name[i])
        ina = i
print(maxl,inst_name[ina])
# a respondent id shared across agencies, to verify name uniqueness per key
inst_data.loc[inst_data.Respondent_ID == '0000009788']
The names are unique to the key Agency_Code-Respondent_ID
This is an important risk attribute that shows payment capacity of a given borrower.
Validations:
- Data distribution. Is the income within the state and county ranges and expected trends at each level?
- Limit of detection. Set boundaries to determine possible errors or outliers in the data.
- Missing values. Is the number of missing values as in previous years, is it increasing/decreasing?
# notebook echo of the cnt value from the previous cell
cnt
# mean applicant income per State/year relative to the overall mean income
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()/data.Applicant_Income_000_num.mean()
ax = cnt.unstack().plot(kind='bar',figsize=(10,5))
ax.set_title("Income Deviation from Region Average",fontsize=18)
ax.set_ylabel("Percentage",fontsize=15)
ax.set_xlabel("State",fontsize=15)
ax.legend(fontsize=12)
This attribute provides information about potential risk. A shift in the composition of the applications and loans could represent an underlying recovery risk: when a loan defaults, the proceeds are distributed according to the lien order.
# share of the total loan amount by agency and lien status
cnt = data.groupby(['Agency_Code_Description','Lien_Status_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the total loan amount by agency
cnt = data.groupby(['Agency_Code_Description'])['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='',fontsize=13,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the number of loans by State
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Loan distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# share of the loan amount by State
cnt = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cnt.plot.pie(subplots=True,figsize=(6,6),
             title='Amount distribution by State',fontsize=14,autopct ='%1.1f%%',
             explode = [0.01 for x in range(len(cnt))])
# market share by State within each year (one pie per year)
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum()
cnt = cnt /data.groupby(['As_of_Year'])['Loan_Amount_000'].sum()
cnt.unstack().plot.pie(subplots=True,figsize=(20,5),
                       title='Market Share by Year & State',fontsize=17,autopct ='%1.1f%%')
# difference (in points) between amount share and count share per State
cnt = data.groupby('State')['Loan_Amount_000'].count()/data.shape[0]
cnt2 = data.groupby('State')['Loan_Amount_000'].sum()/data.Loan_Amount_000.sum()
cn = (cnt2 - cnt)*100
print(cn)
# average applicant income by State and year
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# average loan amount by State and year
cnt = data.groupby(['State','As_of_Year'])['Loan_Amount_000'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Loan Amount by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# duplicate of the average-income plot above (kept as in the original)
cnt = data.groupby(['State','As_of_Year'])['Applicant_Income_000_num'].mean()
ax = cnt.unstack().plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income by State & Year',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# average income and loan amount side by side per State
cnt = data.groupby(['State'])['Loan_Amount_000',
                              'Applicant_Income_000_num'].mean()
ax = cnt.plot.bar(figsize=(10,5),fontsize=12)
ax.set_title('Average Income and Loan amount by State ',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=15)
ax.legend(fontsize=15)
# total amount and number of loans per year
cnt =data.groupby(['As_of_Year'])['Loan_Amount_000']
print(cnt.sum(),cnt.count())
# total loan amount by State and year
data.groupby(['State','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount by State and Year',
    kind='bar',
    figsize=(10,5))
# total loan amount for each CC flag by year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].sum().unstack().plot(
    title='Loan Amount for Conventional & Conforming Loans by Year',
    kind='bar',
    figsize=(10,5))
# FFIEC median family income by State and year
data.groupby(['State','As_of_Year'])['FFIEC_Median_Family_Income_num'].mean().unstack().plot(
    title='FFIEC Average Income by State and Year',
    kind='bar',
    figsize=(10,5))
# share of the number of loans by State and CC flag
# (fixed: the original divided by the undefined module-level name
#  'loan_data' -- the merged frame is 'data')
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].count()
cnt = cnt / data.shape[0]
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by loan',fontsize=15)
ax.set_ylabel("% of number of loans",fontsize=12)
ax.set_xlabel("State by CC Flag",fontsize=12)
# share of the loan amount by State and CC flag
# (same fix: 'loan_data' -> 'data')
cnt = data.groupby(['State','Conventional_Conforming_Flag'])['Loan_Amount_000'].sum()
cnt = cnt / data.Loan_Amount_000.sum()
print(cnt.round(2))
ax = cnt.unstack().plot(kind='bar',figsize=(10,5),fontsize=13)
ax.set_title('Market distribution by Amount',fontsize=15)
ax.set_ylabel("Amount percentage of loans",fontsize=14)
ax.set_xlabel("State by CC Flag",fontsize=14)
# correlation plot using heatmap (lower triangle only)
cor = data.corr()
# fixed: the np.bool alias was removed in NumPy 1.24 -- use the builtin bool
mask = np.zeros_like(cor, dtype=bool) # hiding upper triangle
mask[np.triu_indices_from(mask,k=1)] = True
f, ax = plt.subplots(figsize=(8, 8))
cmap = sns.diverging_palette(15, 150,center='light', as_cmap=True)
sns.heatmap(cor, mask=mask, cmap=cmap, vmax=.3,square=True, linewidths=1.15, cbar_kws={"shrink": .65}, ax=ax,)
# fixed: 'loan_data' is not defined at module level -- use the merged 'data'
scatter_matrix(data[x_attr], alpha=0.2, figsize=(20, 10), diagonal='kde')
# mean loan amount, applicant income and FFIEC income by State and CC flag
data.groupby(['State',
              'Conventional_Conforming_Flag'])['Loan_Amount_000',
                                               'Applicant_Income_000_num',
                                               'FFIEC_Median_Family_Income_num'].mean().round(2).style.format("{:2}")
# mean loan amount and income by CC flag and year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000',
             'Applicant_Income_000_num'].mean().unstack().plot(title='Income and Loan Amount Conventional and Conforming',kind='bar',figsize=(10,5))
The average loan amount is increasing while the income of the target market has been declining over the past three years. This is potentially increasing stress on households' payment capacity. In addition, the number of loans per segment and year is plotted below.
# number of loans by CC flag and year
data.groupby(['Conventional_Conforming_Flag','As_of_Year'])['Loan_Amount_000'].count().unstack().plot(kind='bar',
                                                                                                      figsize=(10,5))
def plot_PCA_by4(DTtrans,y_pred,c1=1,c2=2,c3=3,c4=4):
    ''' Plot principal components in a 2x2 figure: the first row compares
    component c1 against c2 and c3; the second row compares c2 against c3
    and c4.

    Parameters
    ----------
    DTtrans: Numpy array with numeric values (PCA-transformed data, one
        component per column).
    y_pred: Cluster assigned to each instance in DTtrans; used as the
        scatter color.
    c1: Column index plotted on the x-axis of row 1. Default to 1.
    c2: Column index compared against c1, and plotted on the x-axis of
        row 2. Default to 2.
    c3: Column index compared against c1 (row 1, column 2) and c2
        (row 2, column 1). Default to 3.
    c4: Column index compared against c2 (row 2, column 2). Default to 4.

    NOTE(review): the defaults 1..4 are 0-based column indices, i.e. they
    select the 2nd-5th principal components while the axis labels read
    PC1-PC4 (column 0 -- the first PC -- is never shown); the first subplot
    also labels the y-axis 'PC1' although its y data is column c2 -- confirm
    whether this is intended before changing it.
    '''
    plt.figure(figsize=(12, 12))
    # row 1: c1 vs c2, then c1 vs c3
    plt.subplot(221)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c2], c= y_pred)
    plt.ylabel('PC1')
    plt.xlabel('PC2')
    plt.title('KMeans Clustering')
    plt.subplot(222)
    plt.scatter(DTtrans[:, c1], DTtrans[:, c3], c= y_pred)
    plt.xlabel('PC3')
    plt.title('KMeans Clustering')
    # row 2: c2 vs c3, then c2 vs c4
    plt.subplot(223)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c3], c= y_pred)
    plt.ylabel('PC2')
    plt.xlabel('PC3')
    plt.subplot(224)
    plt.scatter(DTtrans[:, c2], DTtrans[:, c4], c= y_pred)
    plt.xlabel('PC4')
    plt.show()
def Kmean_cluster(data,n_clusters=8, plot=True,n_components=None,normalize='zscore'):
    '''
    Performs unsupervised clustering using KMeans after PCA dimensionality
    reduction.

    Returns the fitted KMeans model, the PCA-reduced data, the fitted PCA
    model and the cluster label assigned to each instance.

    Parameters:
    -----------
    data: pandas DataFrame with the merged data. The module-level list
        x_attr selects the columns used -- TODO: consider passing it as a
        parameter.
    n_clusters: number of clusters to create. Default 8 (sklearn's default).
    plot: bool, create the cluster plots. Default True.
    n_components: number of components created by PCA. Default: the number
        of attributes passed in data.
    normalize: normalization to apply, 'zscore' (default) or 'minmax'.
    '''
    # normalization functions: minmax rescales to [0, 100]
    minmax = lambda x: ((x-x.min())/(x.max()-x.min()) * (100-0) + 0)
    zscore = lambda x: (x - x.mean()) / x.std()
    if normalize == 'zscore':
        data_norm = data[x_attr].apply(zscore)
    else:
        data_norm = data[x_attr].apply(minmax)
    # remove null data
    data_norm.dropna(inplace=True)
    colnames = list(data_norm.columns)
    # PCA dimensionality reduction
    if n_components is None:
        n_components = len(colnames)
    pca = decomposition.PCA(n_components=n_components)
    DTtrans = pca.fit(data_norm).transform(data_norm)
    # fit a single KMeans model
    # (fixed: the original fitted a second, default KMeans() that ignored
    #  n_clusters and random_state, and that model was the one returned)
    kmeans = KMeans(n_clusters=n_clusters, random_state=33)
    clusters = kmeans.fit_predict(DTtrans)
    # variance captured by component
    print('Variance ratio provided by component: \n', pca.explained_variance_ratio_, '\n')
    print('Percentage Variance Captured with:')
    for i in range(1, len(colnames)+1):
        print('\t %i components %0.1f' % (i, 100*sum(pca.explained_variance_ratio_[:i])))
    # scree plot of the explained variance
    plt.figure(1, figsize=(7, 5))
    plt.clf()
    plt.plot(pca.explained_variance_, linewidth=3)
    plt.xlabel('Number of components')
    plt.ylabel('Explained variance')
    print('\nTotal number of clusters created: %i' % (len(kmeans.cluster_centers_)))
    if plot:
        # fixed: the original passed the undefined name 'y_pred' (NameError)
        plot_PCA_by4(DTtrans, clusters)
    return kmeans, DTtrans, pca, clusters
# cluster with minmax normalization, 20 clusters
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='minmax')
# minmax normalization
# centroid coordinates, one column per attribute
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)
# attributes with an absolute loading of at least 0.35 on each principal component
for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
centroids.describe()
# per-column mean and standard deviation of every numeric column
datos_mm = []
for col in data.columns:
    if data[col].dtype in [int, float]:
        datos_mm.append([col, data[col].mean(), data[col].std()])
# fixed: the stored values are mean/std but the columns were labelled
# 'min'/'max'; the equivalent zscore block later in this file labels them
# 'mean'/'std' correctly
datos_mm = pan.DataFrame(datos_mm, columns=['attr', 'mean', 'std'])
datos_mm.to_csv('Centroids minmax vals.csv', index=False)
# cluster with zscore normalization, 20 clusters
kmeans, DTtrans, pca, clusters = Kmean_cluster(data,n_clusters=20, plot=True,n_components=None,normalize='zscore')
# zscore normalization
# centroid coordinates, one column per attribute
centroids = pan.DataFrame(kmeans.cluster_centers_, columns=x_attr)
name = list(centroids.columns)
# attributes with an absolute loading of at least 0.35 on each principal component
for k in range(len(pca.components_)):
    print('PC %i:'%(k+1))
    for i in range(len(pca.components_[0])):
        if abs(pca.components_[k][i])>=.35:
            print('\t%.2f \t %s'%(pca.components_[k][i],name[i]))
# per-column mean and standard deviation of every numeric column
datos_mm =[]
for col in data.columns:
    if data[col].dtype in [int,float]:
        temp =[]
        temp.append(col)
        temp.append(data[col].mean())
        temp.append(data[col].std())
        datos_mm.append(temp)
datos_mm = pan.DataFrame(datos_mm,columns=['attr','mean','std'])
# inspect the records of a single institution
data.loc[data.Respondent_Name =='WELLS FARGO BANK']
# list the states covered by each MSA/MD code
a = data.MSA_MD_num.unique()
a.sort()
# notebook echo of the sorted codes
a
for i in a:
    if np.isnan(i) == False:
        print(data.loc[data.MSA_MD_num == i].State.unique())