

Predict Loan Eligibility for Dream Housing Finance company
Dream Housing Finance company deals in all kinds of home loans. It has a presence across urban, semi-urban and rural areas. A customer first applies for a home loan, after which the company validates the customer's eligibility for the loan.
The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, the company has provided a dataset for identifying the customer segments that are eligible for a loan, so that it can specifically target these customers.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# --- Load the raw files (Colab / Google Drive paths) -------------------------
df_train = pd.read_csv('/content/drive/My Drive/loan_prediction/train_ctrUa4K (2).csv')
test = pd.read_csv('/content/drive/My Drive/loan_prediction/test_lAUu6dG (2).csv')
sub = pd.read_csv('/content/drive/My Drive/loan_prediction/sample_submission_49d68Cx.csv')
(df_train.shape, test.shape)

# Tag every row with its origin so the stacked frame can be split apart again
# after the shared preprocessing steps.
df_train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
train = pd.concat([df_train, test])
train
train.isna().sum()

# Quick look at the target and at Gender (value_counts skips missing values).
status_counts = train['Loan_Status'].value_counts()
status_counts.plot(kind='bar')
status_counts
train['Gender'].value_counts().plot(kind='bar')
train['Gender'].unique()
# Fill missing categorical values with fixed defaults.
# The result is assigned back to the column instead of calling
# `train[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series and does not update `train` when pandas Copy-on-Write is active.
train['Gender'] = train['Gender'].fillna('Male')
train['Married'].unique()
train['Married'].value_counts().plot(kind='bar')
train['Married'] = train['Married'].fillna('Yes')
train['Dependents'] = train['Dependents'].fillna('2')
train['Self_Employed'].value_counts().plot(kind='bar')
# NOTE(review): 'Yes' is used as the default here; confirm this is intended
# rather than the most frequent category shown in the bar chart above.
train['Self_Employed'] = train['Self_Employed'].fillna('Yes')
train.nunique()
# Rescale LoanAmount and derive the combined household income.
# NOTE(review): the *1000 presumably converts from thousands - confirm units.
train['LoanAmount'] = train['LoanAmount'] * 1000
train['LoanAmount']
train['Wallet_Size'] = train['CoapplicantIncome'] + train['ApplicantIncome']
train

from sklearn.linear_model import LinearRegression

# The concatenated frame carries repeated index labels (train and test each
# start at 0); use a fresh positional index so the mask-based assignment
# below is unambiguous.
train = train.reset_index(drop=True)

# --- Impute missing LoanAmount with a linear regression ----------------------
loan_predictors = ['Wallet_Size', 'Loan_Amount_Term']
missing_loan = train['LoanAmount'].isna()

# b: rows with a known LoanAmount and complete predictors (fit set)
# a: rows whose LoanAmount has to be predicted
b = train.loc[~missing_loan, ['Loan_ID'] + loan_predictors + ['LoanAmount']]
b = b.set_index('Loan_ID').dropna()
a = train.loc[missing_loan, ['Loan_ID'] + loan_predictors].set_index('Loan_ID')

y = b['LoanAmount']
X = b.drop('LoanAmount', axis=1)
model = LinearRegression()
model.fit(X, y)

# LinearRegression.predict rejects NaN inputs, so a row that lacks both
# LoanAmount and a predictor would abort the run; fall back to the median of
# the fit set for such predictors.
X_predict = a.fillna(X.median())
if len(X_predict):
    y_predict = model.predict(X_predict)
    # `a` was selected with the same mask and keeps row order, so the
    # predictions line up positionally with the masked rows.
    train.loc[missing_loan, 'LoanAmount'] = y_predict
else:
    y_predict = np.array([])
l = pd.DataFrame({'S': y_predict}, index=a.index)
l.round(2)

train['LoanAmount'] = train['LoanAmount'].round()
train.isna().sum()
train
# --- Loan_Amount_Term: exploratory regression, then constant fill ------------
# A linear model is fitted and its predictions are displayed for inspection
# only; the value actually used for missing terms is the constant 360.
from sklearn.linear_model import LinearRegression

term_missing = train['Loan_Amount_Term'].isna()
b = train.loc[~term_missing, ['Wallet_Size', 'LoanAmount', 'Loan_Amount_Term']].dropna()
a = train.loc[term_missing, ['Wallet_Size', 'LoanAmount']]

y = b['Loan_Amount_Term']
X = b.drop('Loan_Amount_Term', axis=1)
model = LinearRegression()
model.fit(X, y)

X_predict = a  # rows whose term is missing
y_predict = model.predict(X_predict)
l = pd.DataFrame({'S': y_predict})
l

train['Loan_Amount_Term'].unique()
# Assign the result back: `train[col].fillna(..., inplace=True)` works on an
# intermediate Series and is not reliable under pandas Copy-on-Write.
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(360)
# EMI: loan amount spread over the loan term, rounded to a whole number.
train['EMI'] = train['LoanAmount'].div(train['Loan_Amount_Term']).round()
# Wallet_Share: EMI as a percentage of the combined income, two decimals.
train['Wallet_Share'] = train['EMI'].div(train['Wallet_Size']).mul(100).round(2)
train.isna().sum()
# --- Credit_History: exploratory classifier, then constant fill --------------
# A logistic regression is fitted and its predictions are displayed for
# inspection only; missing values are ultimately filled with 1.0.
from sklearn import linear_model

credit_predictors = ['Wallet_Size', 'LoanAmount', 'Loan_Amount_Term', 'EMI', "Wallet_Share"]
history_missing = train['Credit_History'].isna()

b = train.loc[~history_missing, credit_predictors + ['Credit_History']]
a = train.loc[history_missing, credit_predictors]

logistic = linear_model.LogisticRegression()
y = np.ravel(b['Credit_History'])
X = b.drop('Credit_History', axis=1)
logistic.fit(X, y)
preds = logistic.predict(a)
preds

# Assign the result back: `train[col].fillna(..., inplace=True)` works on an
# intermediate Series and is not reliable under pandas Copy-on-Write.
train['Credit_History'] = train['Credit_History'].fillna(1.0)
train.isna().sum()
# Persist the imputed frame, then keep only the modelling columns.
train.to_csv('/content/drive/My Drive/loan_prediction/updatedtrain.csv')
train.columns
train.dtypes

kept_columns = [
    'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Wallet_Size', 'EMI', 'Wallet_Share', 'LoanAmount',
    'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
    'train_or_test',
]
train = train[kept_columns]
train.head(3)

# Cast the whole-number features to int.
integer_columns = ["Wallet_Size", "EMI", "LoanAmount", "Loan_Amount_Term", 'Credit_History']
train = train.astype(dict.fromkeys(integer_columns, int))
train.head(3)

# Views of the categorical (object) and remaining columns, for inspection.
cat_cols = train.select_dtypes(include=object)
cat_cols
num_cols = train.select_dtypes(exclude=object)
num_cols
train.nunique()
def _ordinal_codes(labels):
    """Map each label to its position in *labels* (0, 1, 2, ...)."""
    return {label: code for code, label in enumerate(labels)}


# Label -> integer code tables for the categorical columns.
Gender_map = _ordinal_codes(['Female', 'Male'])
Married_map = _ordinal_codes(['No', 'Yes'])
Property_Area_map = _ordinal_codes(['Urban', 'Rural', 'Semiurban'])
Education_map = _ordinal_codes(['Not Graduate', 'Graduate'])
Loan_Status_map = _ordinal_codes(['N', 'Y'])
Dependents_map = _ordinal_codes(['0', '1', '2', '3+'])
Self_Employed_map = _ordinal_codes(['No', 'Yes', 'unknown'])
# Loan terms use hand-assigned codes, so they are listed explicitly.
Loan_term_map = dict([
    (360, 2), (120, 1), (240, 3), (180, 5), (300, 0), (60, 4),
    (480, 7), (36, 10), (84, 8), (12, 9), (350, 6), (6, 11),
])

# The mappings above are currently NOT applied (the calls are disabled):
# train['Gender'] = train['Gender'].map(Gender_map)
# train['Married'] = train['Married'].map(Married_map)
# train['Property_Area'] = train['Property_Area'].map(Property_Area_map)
# train['Education'] = train['Education'].map(Education_map)
# train['Dependents'] = train['Dependents'].map(Dependents_map)
# train['Self_Employed'] = train['Self_Employed'].map(Self_Employed_map)
# train['Loan_Amount_Term'] = train['Loan_Amount_Term'].map(Loan_term_map)
# train['Loan_Status'] = train['Loan_Status'].map(Loan_Status_map)
train.dtypes
train
# Group-level LoanAmount statistics broadcast back onto every row.
# Each pair is (suffix used in the new column name, column to group by); the
# 'bed' and 'department' suffixes are kept because later steps refer to the
# resulting column names.
group_specs = [
    ('Gender', 'Gender'),
    ('Married', 'Married'),
    ('bed', 'Dependents'),
    ('department', 'Education'),
    ('Self_Employed', 'Self_Employed'),
    ('Loan_Amount_Term', 'Loan_Amount_Term'),
    ('Credit_History', 'Credit_History'),
    ('Property_Area', 'Property_Area'),
]
for stat_name in ('mean', 'sum', 'max', 'min'):
    for name_suffix, group_col in group_specs:
        feature_name = stat_name + '_LoanAmount_per_' + name_suffix
        train[feature_name] = train.groupby([group_col])['LoanAmount'].transform(stat_name)
# One-hot encode the categoricals (first level dropped) with Loan_ID as index.
train.set_index('Loan_ID', inplace=True)
train = pd.get_dummies(train, drop_first=True)
train

# Split back into the two original parts using the origin indicator column,
# then discard that column from both parts.
origin_flag = train['train_or_test_train']
train_df = train.loc[origin_flag.isin([1])]
test = train.loc[origin_flag.isin([0])]
train_df = train_df.drop(columns=['train_or_test_train'])
test = test.drop(columns=['train_or_test_train'])
train_df
# --- Balance the classes by resampling the Loan_Status_Y == 0 rows -----------
from sklearn.utils import resample

upsample_data = train_df.copy()
majority = upsample_data[upsample_data['Loan_Status_Y'] == 1]
minority = upsample_data[upsample_data['Loan_Status_Y'] == 0]

# Draw (with replacement) as many class-0 rows as there are class-1 rows.
# The size is derived from the data instead of the previously hard-coded 422,
# so the classes stay balanced if the training file changes.
# NOTE(review): assumes class 1 is the larger class - confirm.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42,
)
del upsample_data
upsample_data = pd.concat([majority, minority_upsampled])

# Pass the Series as `x=`: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=upsample_data['Loan_Status_Y'])
train_df = upsample_data
train_df.columns
# --- Standardise the numeric features ----------------------------------------
# Columns to scale: the five base numeric features followed by the
# mean/sum/max/min LoanAmount group statistics (same names and order as the
# columns created earlier in the script).
group_suffixes = [
    'Gender', 'Married', 'bed', 'department', 'Self_Employed',
    'Loan_Amount_Term', 'Credit_History', 'Property_Area',
]
scaled_cols = ['Wallet_Size', 'EMI', 'Wallet_Share', 'LoanAmount', 'Loan_Amount_Term'] + [
    '{}_LoanAmount_per_{}'.format(stat_name, suffix)
    for stat_name in ('mean', 'sum', 'max', 'min')
    for suffix in group_suffixes
]
ref_cols = train_df[scaled_cols]

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
# Fit the scaler on the training rows only ...
scaled_train = pd.DataFrame(
    ss.fit_transform(train_df[scaled_cols]),
    index=train_df.index,
    columns=scaled_cols,
)
# ... and re-use those statistics for the test rows. Calling fit_transform on
# the test set would put the two sets on different scales.
scaled_test = pd.DataFrame(
    ss.transform(test[scaled_cols]),
    index=test.index,
    columns=scaled_cols,
)
scaled_train.round(2)
scaled_test.round(2)

train_df = train_df.drop(columns=scaled_cols)
test = test.drop(columns=scaled_cols)
train_df

# Attach the scaled columns positionally. After upsampling, Loan_ID is no
# longer unique in train_df, so merging on it would pair every duplicate with
# every other duplicate and multiply rows. The resulting column order
# (unscaled columns first, scaled columns after) matches what a merge gives.
train = train_df.assign(**{col: scaled_train[col].to_numpy() for col in scaled_cols})
test = test.assign(**{col: scaled_test[col].to_numpy() for col in scaled_cols})
train
train.columns
# The target column carries no information for the test rows.
test.drop(columns='Loan_Status_Y', inplace=True)

# Group statistics removed from both frames before modelling.
dropped_features = (
    ['mean_LoanAmount_per_' + s for s in (
        'Gender', 'Married', 'bed', 'department',
        'Self_Employed', 'Loan_Amount_Term', 'Property_Area')]
    + ['sum_LoanAmount_per_' + s for s in (
        'Gender', 'Married', 'bed', 'department', 'Self_Employed',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area')]
    + ['max_LoanAmount_per_Gender']
)
to_drop = train[dropped_features]
to_drop_test = test[dropped_features]
train.drop(columns=dropped_features, inplace=True)
test.drop(columns=dropped_features, inplace=True)
train
test
# --- Hold-out split and randomized hyper-parameter search for XGBoost --------
from sklearn.model_selection import train_test_split
import xgboost as xgb

y = train['Loan_Status_Y']
X = train.drop('Loan_Status_Y', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y_train

from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

clf = xgb.XGBClassifier()
param_grid = {
    'silent': [False],
    'max_depth': [6, 10, 15, 20],
    # Fixed typo: the last entry was written `0,3`, which added the two
    # learning rates 0 and 3 to the grid instead of 0.3.
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100],
}
# NOTE: fit_params is defined but never passed to rs_clf.fit below, so the
# search runs without early stopping or an evaluation set.
fit_params = {
    'eval_metric': 'mlogloss',
    'early_stopping_rounds': 10,
    'eval_set': [(X_test, y_test)],
}
rs_clf = RandomizedSearchCV(
    clf, param_grid, n_iter=20,
    n_jobs=1, verbose=2, cv=2,
    scoring='neg_log_loss', refit=False, random_state=42,
)
print("Randomized search..")
rs_clf.fit(X_train, y_train)
# print("Randomized search time:", time.time() - search_time_start)

best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params):
    print('%s: %r' % (param_name, best_params[param_name]))
X_train
# Fit an XGBoost classifier with hand-picked settings and report hold-out
# accuracy.
xgb_settings = dict(
    max_depth=15,
    eval_metric="auc",
    min_child_weight=0.5,
    subsample=0.8,
    colsample_bytree=0.6,
    colsample_bylevel=0.4,
    n_estimators=100,
    gamma=.8,
    reg_lambda=10.0,
    silent=False,
    learning_rate=.02,
)
model = xgb.XGBClassifier(**xgb_settings)
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train.values.ravel(),
    early_stopping_rounds=500,
    eval_metric=['auc', 'error'],
    eval_set=eval_set,
    verbose=500,
)
holdout_predictions = model.predict(X_test)
eval_score = accuracy_score(y_test, holdout_predictions)
print('Eval ACC: {}'.format(eval_score))
# First LightGBM model: settings collected in a single dict.
params = {
    'learning_rate': 0.01,
    'max_depth': 6,
    'n_estimators': 500,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'subsample': 0.9,
    'random_state': 42,
    'colsample_bytree': 0.9,
    'min_data_in_leaf': 62,
    'reg_alpha': 0.7,
    'reg_lambda': 1.11,
}
import lightgbm as lgb

clf = lgb.LGBMClassifier(**params)
clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=500,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric=['accuracy', 'auc'],
    verbose=True,
)
# Hold-out accuracy, then predictions for the unlabeled test rows.
preds = clf.predict(X_test)
eval_score = accuracy_score(y_test, preds)
prediction_test = clf.predict(test)
print('Eval ACC: {}'.format(eval_score))
prediction_test
# Second LightGBM model: many rounds at a high learning rate, stopped early
# on hold-out AUC.
from lightgbm import LGBMClassifier

lgbm_settings = {
    'n_estimators': 10000,
    'learning_rate': 0.5,
    'min_child_samples': 10,
    'random_state': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 2,
    'reg_lambda': 2,
}
clf = LGBMClassifier(**lgbm_settings)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=50,
    eval_metric='auc',
    early_stopping_rounds=100,
)
holdout_labels = clf.predict(X_test)
eval_score = accuracy_score(y_test, holdout_labels)
print('Eval ACC: {}'.format(eval_score))
preds = clf.predict(test)
preds
# --- Set-up for the simulated-annealing search over XGBoost parameters -------
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)
dtest = xgb.DMatrix(test)

## parameters held fixed during the search
num_rounds = 20  # number of boosting iterations
param = {
    'silent': 1,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 1234,
}

from collections import OrderedDict

negatives = (y_train == 0).sum()
positives = (y_train == 1).sum()
ratio_neg_to_pos = negatives / positives
print('Ratio of negative to positive instances: {:6.1f}'.format(ratio_neg_to_pos))

## parameters to be tuned, each with its ordered list of candidate values
tune_dic = OrderedDict([
    ('max_depth', [5, 10, 15, 20, 25]),                    # maximum tree depth
    ('subsample', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),         # share of rows used per tree
    ('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),  # share of columns used per tree
    ('eta', [0.01, 0.05, 0.10, 0.20, 0.30, 0.40]),         # learning rate
    ('gamma', [0.00, 0.05, 0.10, 0.15, 0.20]),             # minimum loss reduction for a split
    ('scale_pos_weight', [30, 40, 50, 300, 400, 500, 600, 700]),  # positive-class weight
])

# Size of the full grid = product of the candidate-list lengths.
lengths = list(map(len, tune_dic.values()))
combs = 1
for n_values in lengths:
    combs *= n_values
print('Total number of combinations: {:16d}'.format(combs))

maxiter = 100
columns = list(tune_dic) + ['F-Score', 'Best F-Score', 'auc']
results = pd.DataFrame(index=range(maxiter), columns=columns)  # one row of results per iteration
def perf_measures(preds, labels, print_conf_matrix=False):
    """Return the F-score of thresholded predictions against binary labels.

    Args:
        preds: array-like of scores; a score >= 0.5 counts as a positive
            prediction.
        labels: array-like of true labels; a label equal to 1 is positive.
        print_conf_matrix: when true, print the confusion matrix counts.

    Returns:
        2 * precision * recall / (precision + recall) as a float, or 0.0 when
        there are no true positives. That case previously raised
        ZeroDivisionError (nothing predicted positive) or produced NaN.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    predicted_positive = preds >= 0.5
    actual_positive = labels == 1

    act_pos = int(actual_positive.sum())                          # actual positives
    act_neg = len(labels) - act_pos                               # actual negatives
    pred_pos = int(predicted_positive.sum())                      # predicted positives
    true_pos = int((predicted_positive & actual_positive).sum())  # true positives
    false_pos = pred_pos - true_pos                               # false positives
    false_neg = act_pos - true_pos                                # false negatives
    true_neg = act_neg - false_pos                                # true negatives

    if print_conf_matrix:
        print('\nconfusion matrix')
        print('----------------')
        print('tn:{:6d} fp:{:6d}'.format(true_neg, false_pos))
        print('fn:{:6d} tp:{:6d}'.format(false_neg, true_pos))

    if true_pos == 0:
        # Precision and/or recall are 0 or undefined; report the worst score
        # so a search loop can keep going.
        return 0.0

    precision = true_pos / pred_pos  # tp / (tp + fp)
    recall = true_pos / act_pos      # tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)
def do_train(cur_choice, param, train, train_s, trainY, valid, valid_s, validY, print_conf_matrix=False):
    """Train an XGBoost model and score it on the validation DMatrix.

    Args:
        cur_choice: dict of the variable parameters to use for this run.
        param: dict of fixed parameters; it is updated IN PLACE with the
            entries of ``cur_choice`` before training.
        train: training DMatrix passed to ``xgb.train``.
        valid: validation DMatrix used for prediction and labels.
        train_s, trainY, valid_s, validY: accepted for call compatibility;
            not used by this function.
        print_conf_matrix: forwarded to ``perf_measures``.

    Returns:
        Tuple ``(f_score, model)``.
    """
    print('Parameters:')
    for (key, value) in cur_choice.items():
        print(key, ': ', value, ' ', end='')
        param[key] = value
    print('\n')

    ## a watchlist could be used to monitor the boosting iterations:
    ## evallist = [(train,train_s), (valid,valid_s)]
    ## model = xgb.train( param, train, num_boost_round=num_rounds,
    ##                    evals=evallist,verbose_eval=False)
    model = xgb.train(param, train, num_boost_round=num_rounds)

    preds = model.predict(valid)
    labels = valid.get_label()
    f_score = perf_measures(preds, labels, print_conf_matrix)
    # The accuracy print that used to follow the return statement was
    # unreachable and has been removed; it would also have needed the
    # predictions thresholded into class labels first.
    return (f_score, model)
def next_choice(cur_params=None):
    """Return the next parameter combination to try.

    With ``cur_params`` empty or None, a random combination is drawn from
    ``tune_dic``. Otherwise one parameter of ``cur_params`` is picked at
    random and moved to an adjacent value in its candidate list; ``cur_params``
    itself is not modified.

    The neighbor is derived from the ``cur_params`` argument. The earlier
    version read the module-level ``cur_choice`` instead, so after a rejected
    or revisited move it stepped from the wrong starting point.

    All random draws use the ``random`` module (previously mixed with
    ``np.random``), so seeding ``random`` makes the search repeatable and the
    chosen values stay plain Python numbers.
    """
    if cur_params:
        ## pick the parameter to change and look up its current value
        choose_param_name, cur_value = random.choice(list(cur_params.items()))
        all_values = list(tune_dic[choose_param_name])  ## candidate values of that parameter
        cur_index = all_values.index(cur_value)

        if cur_index == 0:
            ## first in the range: only the second value is adjacent
            next_index = 1
        elif cur_index == len(all_values) - 1:
            ## last in the range: only the previous value is adjacent
            next_index = len(all_values) - 2
        else:
            ## otherwise step left or right at random
            next_index = cur_index + random.choice([-1, 1])

        next_params = dict(cur_params)
        next_params[choose_param_name] = all_values[next_index]
        print('selected move: {:10s}: from {:6.2f} to {:6.2f}'.
              format(choose_param_name, cur_value, all_values[next_index]))
    else:
        ## no starting point: draw every parameter at random
        next_params = {key: random.choice(values) for key, values in tune_dic.items()}
    return next_params
# --- Simulated-annealing search over the tunable XGBoost parameters ----------
import random
random.seed(1234)
import time

# time.clock() was removed in Python 3.8; process_time() reports CPU time of
# the process, matching the "process time" message printed at the end.
t0 = time.process_time()

T = 0.40                 ## initial temperature
best_params = dict()     ## best parameters found so far
best_f_score = -1.       ## best f-score so far
prev_f_score = -1.       ## f-score of the current starting point
prev_choice = None       ## parameters of the current starting point

# Visited combinations are stored as tuples holding one candidate-list index
# per tuned parameter. The earlier weighted-sum hash had five weights for six
# parameters, so the last parameter in sorted order was ignored and
# combinations differing only in it were treated as already visited.
hash_values = set()
param_order = sorted(tune_dic)

for iteration in range(maxiter):
    print('\nIteration = {:5d} T = {:12.6f}'.format(iteration, T))

    ## find the next selection of parameters not visited before
    while True:
        cur_choice = next_choice(prev_choice)  ## first selection or a neighbor of prev_choice
        hash_val = tuple(tune_dic[name].index(cur_choice[name]) for name in param_order)
        if hash_val in hash_values:
            print('\nCombination revisited - searching again')
        else:
            hash_values.add(hash_val)
            break

    ## train the model and obtain the f-score on the validation dataset
    f_score, model = do_train(cur_choice, param, dtrain, 'train', y_train, dvalid, 'valid', y_test)

    ## store the parameters
    results.loc[iteration, [*cur_choice.keys()]] = list(cur_choice.values())

    print(' F-Score: {:6.2f} previous: {:6.2f} best so far: {:6.2f}'.format(f_score, prev_f_score, best_f_score))

    if f_score > prev_f_score:
        print(' Local improvement')
        ## accept this combination as the new starting point
        prev_f_score = f_score
        prev_choice = cur_choice

        ## update the best parameters if the f-score is globally better
        if f_score > best_f_score:
            best_f_score = f_score
            print(' Global improvement - best f-score updated')
            best_params.update(prev_choice)
    else:
        ## not better than the previous one: accept it as the new starting
        ## point with probability exp(1.3 * (f-score change) / temperature)
        rnd = random.random()
        diff = f_score - prev_f_score
        thres = np.exp(1.3 * diff / T)
        if rnd <= thres:
            print(' Worse result. F-Score change: {:8.4f} threshold: {:6.4f} random number: {:6.4f} -> accepted'.
                  format(diff, thres, rnd))
            prev_f_score = f_score
            prev_choice = cur_choice
        else:
            ## keep the previous f-score and previous choice
            print(' Worse result. F-Score change: {:8.4f} threshold: {:6.4f} random number: {:6.4f} -> rejected'.
                  format(diff, thres, rnd))

    ## store results
    results.loc[iteration, 'F-Score'] = f_score
    results.loc[iteration, 'Best F-Score'] = best_f_score
    if iteration % 5 == 0:
        T = 0.85 * T  ## reduce the temperature every 5 iterations

print('\n{:6.1f} minutes process time\n'.format((time.process_time() - t0) / 60))
print('Best variable parameters found:\n')
print(best_params)
# --- Retrain with the best parameters and plot the search history ------------
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10

print('\nBest parameters found:\n')
print(best_params)

print('\nEvaluation on the test dataset\n')
best_f_score, best_model = do_train(
    best_params, param,
    dtrain, 'train', y_train,
    dvalid, 'valid', y_test,
    print_conf_matrix=True,
)
print('\nF-score on the test dataset: {:6.2f}'.format(best_f_score))

# Left panel: f-score of each iteration; right panel: running best f-score.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(8, 5))
for panel, series_name in ((ax1, 'F-Score'), (ax2, 'Best F-Score')):
    panel.plot(results[series_name])
    panel.set_xlabel('Iterations', fontsize=11)
    panel.set_ylabel(series_name, fontsize=11)
    panel.set_ylim([0.7, 0.9])
plt.tight_layout()
plt.show()

print('\nVariables importance:\n')
p = xgb.plot_importance(best_model)
plt.show()