机器学习-一个半监督演算法-selfLearning-20181225

selfLearning，有时也被称作selfTraining，中文译作自学习，是一个比较简单的半监督演算法框架：

假设我们有标签数据和无标签数据；
1)首先利用有标签数据训练一个模型；

2）利用模型对无标签数据预测，得到无标签数据的概率；
3）设定一个概率阈值（如0.8），将标签为1的概率大于阈值的无标签样本打上1的标签，同理将标签为0的概率大于阈值的无标签样本打上0的标签，并把这些样本从无标签数据中剔除；
4）将3中打上标签的样本加入训练集，重新训练得到模型；

重复2-4步骤，直到没有数据可以加入训练集为止（或达到迭代阈值）；

从上面的步骤中其实可以看出这个演算法的做法，即将预测可能性最大的无标签样本加到训练集中。所以这个演算法的缺陷也很明显：如果前面加入的样本的标签出现偏差，则会导致后面的迭代越来越偏离正确的解，"garbage in,garbage out"，所以这个演算法本身不算是太稳定。

另外，演算法的第三步需要设定一个概率阈值。这种做法在标签均衡的数据中是没问题的，但如果标签极度不均衡（1很少），迭代过程中可能基本没有标签为1的样本能超过阈值而被加入训练集，因此完全照搬是有问题的。这种情况下，更合适的做法是将无标签样本按预测概率排序，每轮按一定比例取首尾的样本（即为1概率最高的若干个作为正例、为0概率最高的若干个作为负例）加入训练集；不过这样一来，迭代的次数（以及每轮加入的样本数）就需要自己设定，实现如下。

import copy

import numpy
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_curve, auc
from xgboost.sklearn import XGBClassifier
#import sys
#from sklearn.linear_model import LogisticRegression as LR

def auc_score(y_true, predict_proba):
    """Return the area under the ROC curve for a set of binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. The original note required a plain
        ``numpy.ndarray`` rather than an indexed pandas Series; the input is
        converted with ``np.asarray`` here so either form is accepted.
    predict_proba : array-like
        Predicted scores for the positive class, one per entry of ``y_true``.

    Returns
    -------
    float
        Area under the curve built from the false-positive rate and recall
        returned by ``roc_curve``.
    """
    # Strip any pandas index up front so only positional order matters.
    labels = np.asarray(y_true)
    scores = np.asarray(predict_proba)
    false_positive_rate, recall, _ = roc_curve(labels, scores)
    return auc(false_positive_rate, recall)

class SelfLearningModel(BaseEstimator):
    """Simplified self-training (self-learning) wrapper around a base model.

    The base model is first fitted on the labeled rows only. Then, round after
    round, the unlabeled rows the model is most confident about are given
    pseudo-labels, added to the training set, and the model is re-fitted.
    Instead of a probability threshold, each round takes a fixed number of the
    highest-probability positives and negatives, and that number shrinks
    geometrically from round to round.

    The base model must follow the scikit-learn style: ``fit(X, y,
    sample_weight=...)`` plus probability prediction.

    Background on self-training:
    http://pages.cs.wisc.edu/~jerryzhu/pub/sslicml07.pdf

    Parameters
    ----------
    basemodel : object
        The base model. Stored as both ``self.model`` (historical name) and
        ``self.basemodel``.
    max_iter : int
        Maximum number of re-training rounds.
    prob_threshold_pos : float
        Probability threshold for adding an unlabeled row as a positive.
        Stored, but not used by the top-n selection implemented in ``fit``.
    prob_threshold_neg : float
        Probability threshold for adding an unlabeled row as a negative.
        Stored, but not used by the top-n selection implemented in ``fit``.
    unlabeled_sample_weight : float
        Sample weight given to originally-unlabeled rows once they join the
        training set (originally-labeled rows have weight 1.0).
    top_n : int
        Number of rows pseudo-labeled 1 in the first selection round.
    NPratio : float
        Ratio of rows labeled 1 to rows labeled 0 added per round; the number
        of negatives is ``top_n / NPratio``. Adjust for imbalanced labels.
    de_ratio : float
        Decay factor: selection round ``k`` adds ``de_ratio ** k`` times the
        initial counts, so the number of added rows shrinks over time.
    predictors : list, optional
        Names of the feature columns of the DataFrame passed to ``fit``.
    dev_data : optional
        Stored, but not used by the code in this class as shown here.
    stopping_t : int
        Stored, but not used by the code in this class as shown here.
    """

    def __init__(self, basemodel, max_iter=200, prob_threshold_pos=0.8,
                 prob_threshold_neg=0.8, unlabeled_sample_weight=0.8,
                 top_n=10, NPratio=1, de_ratio=0.9, predictors=None,
                 dev_data=None, stopping_t=30):
        self.model = basemodel
        # BaseEstimator.get_params() looks attributes up by constructor
        # argument name, so the base model is also exposed as ``basemodel``.
        self.basemodel = basemodel
        self.max_iter = max_iter
        self.prob_threshold_pos = prob_threshold_pos
        self.prob_threshold_neg = prob_threshold_neg
        self.unlabeled_sample_weight = unlabeled_sample_weight
        self.top_n = top_n
        self.NPratio = NPratio
        self.de_ratio = de_ratio
        # A fresh list per instance avoids sharing one mutable default.
        self.predictors = [] if predictors is None else predictors
        self.dev_data = dev_data
        self.stopping_t = stopping_t

    def _select_confident(self, unlabeledX, round_idx):
        """Pick the unlabeled rows to pseudo-label in selection round ``round_idx``.

        Returns a pair ``(uidx_pos, uidx_neg)`` of index labels: the rows with
        the highest class-1 probability and the rows with the highest class-0
        probability. Both are empty when no unlabeled rows remain.
        """
        if unlabeledX.shape[0] == 0:
            # Pool exhausted: do not call the model on a zero-row array.
            nothing = unlabeledX.index[:0]
            return nothing, nothing

        decay = self.de_ratio ** round_idx
        n_pos = int(self.top_n * decay)
        n_neg = int((self.top_n / self.NPratio) * decay)

        # NOTE(review): predict_proba is not defined in the part of the class
        # visible here; it is presumably provided elsewhere (e.g. delegating
        # to self.model.predict_proba) - confirm.
        proba = self.predict_proba(unlabeledX.values)
        pos_prob = pd.Series(proba[:, 1], index=unlabeledX.index)
        neg_prob = pd.Series(proba[:, 0], index=unlabeledX.index)

        pos_top = pos_prob.sort_values(ascending=False).head(n_pos)
        neg_top = neg_prob.sort_values(ascending=False).head(n_neg)

        # With a small pool the same row can land in both picks. Keep it only
        # on the side with the higher probability, so a likely positive is
        # not overwritten with label 0.
        both = pos_top.index.intersection(neg_top.index)
        if len(both) > 0:
            prefers_pos = (pos_prob.loc[both] >= neg_prob.loc[both]).values
            pos_top = pos_top.drop(both[~prefers_pos])
            neg_top = neg_top.drop(both[prefers_pos])

        print("pos_top:min_prob=%f,mean_prob=%f" % (pos_top.min(), pos_top.mean()))
        print("neg_top:min_prob=%f,mean_prob=%f" % (neg_top.min(), neg_top.mean()))
        return pos_top.index, neg_top.index

    def fit(self, df, target="y"):
        """Fit the base model with self-training.

        Parameters
        ----------
        df : pandas.DataFrame
            Holds the ``self.predictors`` feature columns and the label
            column. Unlabeled rows carry the label -1. Rows are addressed by
            index label, so the index is assumed to be unique.
        target : str
            Name of the label column. Defaults to ``"y"``.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If no predictor columns were configured.

        Notes
        -----
        After fitting, ``self.uidx_pos[k]`` / ``self.uidx_neg[k]`` hold the
        index labels picked as positives / negatives in selection round ``k``.
        The picks of the final round are recorded but not trained on, because
        the loop stops before they are labeled. ``self.auc`` is reset to an
        empty dict.
        """
        predictors = list(self.predictors)
        if not predictors:
            raise ValueError("predictors must list at least one feature column")

        X = df[predictors]
        y = df[target].copy()  # work on a copy so df keeps its -1 markers

        # Weights are fixed from the ORIGINAL labeling: rows that started out
        # unlabeled keep the reduced weight after being pseudo-labeled.
        sample_weight_ = np.ones(X.shape[0])
        sample_weight_[(y == -1).values] = self.unlabeled_sample_weight

        # Initial fit on the labeled rows only (no sample weights).
        labeled = y != -1
        self.model.fit(X.loc[labeled, :].values, y[labeled].values)

        self.uidx_pos = {}
        self.uidx_neg = {}
        self.auc = {}

        i = 0
        uidx_pos, uidx_neg = self._select_confident(X.loc[y == -1, :], i)
        self.uidx_pos[i] = uidx_pos
        self.uidx_neg[i] = uidx_neg
        n_selected = len(uidx_pos) + len(uidx_neg)
        print("iter: %i, n_pos: %i, n_neg: %i." % (i, uidx_pos.shape[0], uidx_neg.shape[0]))
        print("uidx num: ", n_selected)

        # Re-train with pseudo-labels until nothing is selected any more or
        # the iteration limit is reached.
        while n_selected != 0 and i < self.max_iter:
            # Move the selected rows from the unlabeled to the labeled set.
            y.loc[uidx_pos] = 1
            y.loc[uidx_neg] = 0
            labeled = y != -1

            self.model.fit(
                X.loc[labeled, :].values,
                y[labeled].values,
                sample_weight=sample_weight_[labeled.values],
            )

            i += 1
            uidx_pos, uidx_neg = self._select_confident(X.loc[y == -1, :], i)
            n_selected = len(uidx_pos) + len(uidx_neg)
            print("iter: %i, n_pos: %i, n_neg: %i." % (i, uidx_pos.shape[0], uidx_neg.shape[0]))
            print("uidx num: ", n_selected)

            self.uidx_pos[i] = uidx_pos
            self.uidx_neg[i] = uidx_neg
        return self