基于 Python 的信用评分模型开发-附数据和代码
数据管道
共 4294字,需浏览 9分钟
·
2020-08-31 17:44
点击上方“数据管道”,选择“置顶星标”公众号
干货福利,第一时间送达
#载入数据
data = pd.read_csv('cs-training.csv')
#数据集确实和分布情况
data.describe().to_csv('DataDescribe.csv')
3.1缺失值处理
1.直接删除含有缺失值的样本。
2.根据样本之间的相似性填补缺失值。
3.根据变量之间的相关关系填补缺失值。
变量 MonthlyIncome 缺失率比较大,所以我们根据变量之间的相关关系填补缺失值,我们采用随机森林法:
# 用随机森林对缺失值预测填充函数
def set_missing(df):
# 把已有的数值型特征取出来
process_df = df.ix[:,[5,0,1,2,3,4,6,7,8,9]]
# 分成已知该特征和未知该特征两部分
known = process_df[process_df.MonthlyIncome.notnull()].as_matrix()
unknown = process_df[process_df.MonthlyIncome.isnull()].as_matrix()
# X为特征属性值
X = known[:, 1:]
# y为结果标签值
y = known[:, 0]
# fit到RandomForestRegressor之中
rfr = RandomForestRegressor(random_state=0,
n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(X,y)
# 用得到的模型进行未知特征值预测
predicted = rfr.predict(unknown[:, 1:]).round(0)
print(predicted)
# 用得到的预测结果填补原缺失数据
df.loc[(df.MonthlyIncome.isnull()), 'MonthlyIncome'] = predicted
return df
data=set_missing(data)#用随机森林填补比较多的缺失值
data=data.dropna()#删除比较少的缺失值
data = data.drop_duplicates()#删除重复项
data.to_csv('MissingData.csv',index=False)
3.2异常值处理
# 年龄等于0的异常值进行剔除
data = data[data['age'] > 0]
#剔除异常值
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90]
#变量SeriousDlqin2yrs取反
data['SeriousDlqin2yrs']=1-data['SeriousDlqin2yrs']
3.3数据切分
from sklearn.cross_validation import train_test_splitY = data['SeriousDlqin2yrs']
X = data.ix[:, 1:]
#测试集占比30%
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# print(Y_train)
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)
clasTest = test.groupby('SeriousDlqin2yrs')['SeriousDlqin2yrs'].count()
train.to_csv('TrainData.csv',index=False)
test.to_csv('TestData.csv',index=False)
客户年龄分布如图4-1所示,可以看到年龄变量大致呈正态分布,符合统计分析的假设。
在本文中,我们采用信用评分模型的变量选择方法,通过 WOE分析方法,即是通过比较指标分箱和对应分箱的违约概率来确定指标是否符合经济意义。首先我们对变量进行离散化(分箱)处理。
5.1分箱处理
我们首先选择对连续变量进行最优分段,在连续变量的分布不满足最优分段的要求时,再考虑对连续变量进行等距分段。最优分箱的代码如下:
# 定义自动分箱函数def mono_bin(Y, X, n = 20):
r = 0
good=Y.sum()
bad=Y.count() - good
while np.abs(r) < 1:
d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
d2 = d1.groupby('Bucket', as_index = True)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
d3['min']=d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['rate'] = d2.mean().Y
d3['woe']=np.log((d3['rate']/(1-d3['rate']))/(good/bad))
d4 = (d3.sort_index(by = 'min')).reset_index(drop=True)
print("=" * 60)
print(d4)
return d4
# 连续变量离散化
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0,1,2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
5.2WOE
woe=ln(goodattribute/badattribute)
在进行分析时,我们需要对各指标从小到大排列,并计算出相应分档的 WOE 值。其中正向指标越大,WOE 值越小;反向指标越大,WOE 值越大。正向指标的 WOE 值负斜率越大,反响指标的正斜率越大,则说明指标区分能力好。WOE 值趋近于直线,则意味指标判断能力较弱。若正向指标和 WOE 正相关趋势、反向指标同 WOE 出现负相关趋势,则说明此指标不符合经济意义,则应当予以去除。
WOE函数实现在上一节的mono_bin()函数里面已经包含,这里不再重复。
5.3相关性分析和IV筛选
相关性图我们通过Python里面的seaborn包,调用heatmap()绘图函数进行绘制,实现代码如下:
corr = data.corr()
#计算各变量的相关性系数
xticks = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']#x轴标签
yticks = list(corr.index)
#y轴标签
fig = plt.figure()ax1 = fig.add_subplot(1, 1, 1)sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
#绘制相关性系数热力图
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)ax1.set_yticklabels(yticks, rotation=0, fontsize=10)plt.show()
# 定义自动分箱函数def mono_bin(Y, X, n = 20):
r = 0
good=Y.sum()
bad=Y.count()-good
while np.abs(r) < 1:
d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
d2 = d1.groupby('Bucket', as_index = True)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
d3['min']=d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['rate'] = d2.mean().Y
d3['woe']=np.log((d3['rate']/(1-d3['rate']))/(good/bad))
d3['goodattribute']=d3['sum']/good
d3['badattribute']=(d3['total']-d3['sum'])/bad
iv=((d3['goodattribute']-d3['badattribute'])*d3['woe']).sum()
d4 = (d3.sort_index(by = 'min')).reset_index(drop=True)
print("=" * 60)
print(d4)
cut=[]
cut.append(float('-inf'))
for i in range(1,n+1):
qua=X.quantile(i/(n+1))
cut.append(round(qua,4))
cut.append(float('inf'))
woe=list(d4['woe'].round(3))
return d4,iv,cut,woe
ivlist=[ivx1,ivx2,ivx3,ivx4,ivx5,ivx6,ivx7,ivx8,ivx9,ivx10]
#各变量
IVindex=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']
#x轴的标签
fig1 = plt.figure(1)ax1 = fig1.add_subplot(1, 1, 1)x = np.arange(len(index))+1ax1.bar(x, ivlist, width=0.4)
#生成柱状图
ax1.set_xticks(x)ax1.set_xticklabels(index, rotation=0, fontsize=12)ax1.set_ylabel('IV(Information Value)', fontsize=14)
#在柱状图上添加数字标签
for a, b in zip(x, ivlist):
plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)plt.show()
6.1WOE转换
#替换成woe函数def replace_woe(series, cut, woe):
list = []
I = 0
while ivalue=series[i]
j=len(cut) - 2
m=len(cut) - 2
while j >= 0:
if value>=cut[j]:
j = -1
else:
j -= 1
m -= 1
list.append(woe[m])
i += 1
return list
# 替换成
woedata['RevolvingUtilizationOfUnsecuredLines'] = Series(replace_woe(data['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
data['age'] = Series(replace_woe(data['age'], cutx2, woex2))
data['NumberOfTime30-59DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, woex3))
data['DebtRatio'] = Series(replace_woe(data['DebtRatio'], cutx4, woex4))
data['MonthlyIncome'] = Series(replace_woe(data['MonthlyIncome'], cutx5, woex5))
data['NumberOfOpenCreditLinesAndLoans'] = Series(replace_woe(data['NumberOfOpenCreditLinesAndLoans'], cutx6, woex6))
data['NumberOfTimes90DaysLate'] = Series(replace_woe(data['NumberOfTimes90DaysLate'], cutx7, woex7))
data['NumberRealEstateLoansOrLines'] = Series(replace_woe(data['NumberRealEstateLoansOrLines'], cutx8, woex8))
data['NumberOfTime60-89DaysPastDueNotWorse'] = Series(replace_woe(data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, woex9))
data['NumberOfDependents'] = Series(replace_woe(data['NumberOfDependents'], cutx10, woex10))
data.to_csv('WoeData.csv', index=False)
6.2Logisic模型建立
导入数据data = pd.read_csv('WoeData.csv')
#应变量
Y=data['SeriousDlqin2yrs']
#自变量,剔除对因变量影响不明显的变量
X=data.drop(['SeriousDlqin2yrs','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
X1=sm.add_constant(X)
logit=sm.Logit(Y,X1)
result=logit.fit()
print(result.summary())
6.3模型检验
#应变量
Y_test = test['SeriousDlqin2yrs']
#自变量,剔除对因变量影响不明显的变量,与模型变量对应
X_test = test.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines', 'NumberOfDependents'], axis=1)
X3 = sm.add_constant(X_test)
resu = result.predict(X3)
#进行预测
fpr, tpr, threshold = roc_curve(Y_test, resu)
rocauc = auc(fpr, tpr)
#计算
AUCplt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
#生成ROC曲线
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()
7.1评分标准
# 我们取600分为基础分值,PDO为20(每高20分好坏比翻一倍),好坏比取20。
z = 20 / math.log(2)
q = 600 - 20 * math.log(20) / math.log(2)
baseScore = round(q + p * coe[0], 0)
7.2部分评分
#计算分数函数 def get_score(coe,woe,factor):
scores=[]
for w in woe:
score=round(coe*w*factor,0)
scores.append(score)
return scores
# 各项部分分数
x1 = get_score(coe[1], woex1, p)
x2 = get_score(coe[2], woex2, p)
x3 = get_score(coe[3], woex3, p)
x7 = get_score(coe[4], woex7, p)
x9 = get_score(coe[5], woex9, p)
#根据变量计算分数
def compute_score(series,cut,score):
list = []
i = 0
while i < len(series):
value = series[i]
j = len(cut) - 2
m = len(cut) - 2
while j >= 0:
if value >= cut[j]:
j = -1
else:
j -= 1
m -= 1
list.append(score[m])
i += 1
return list
test1 = pd.read_csv('TestData.csv')
test1['BaseScore']=Series(np.zeros(len(test1)))+baseScore
test1['x1'] = Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1))
test1['x2'] = Series(compute_score(test1['age'], cutx2, x2))
test1['x3'] = Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3))
test1['x7'] = Series(compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7)
test1['x9'] = Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9))
test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] +test1['x9'] + baseScore
test1.to_csv('ScoreData.csv', index=False)
基于 AI 的机器学习评分卡系统可通过把旧数据(某个时间点后,例如2年)剔除掉后再进行自动建模、模型评估、并不断优化特征变量,可以使系统更加强大。
评论