# 将自定义的频数编码处理整合到 sklearn 的 Pipeline 流程里面:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures # 多项式
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import lightgbm as lgb
import pandas as pd
def load_data(path):
    """Read the training CSV and binarise the ``subscribe`` target.

    Every column except ``id`` is loaded. The ``subscribe`` column is
    rewritten so that the string ``'yes'`` becomes 1 and any other value
    becomes 0.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        The loaded DataFrame with the recoded ``subscribe`` column.
    """
    def _keep_column(col):
        # Skip only the identifier column while parsing.
        return col != 'id'

    def _to_binary(label):
        return 1 if label == 'yes' else 0

    frame = pd.read_csv(path, usecols=_keep_column)
    frame['subscribe'] = frame['subscribe'].apply(_to_binary)
    return frame
# Custom transformer 1: encode categorical features by their relative frequency.
class Freqencode(BaseEstimator, TransformerMixin):
    """Replace each category with the relative frequency it had in the fit data.

    The frequency tables are learned in ``fit`` and only looked up in
    ``transform``, so data transformed later (validation, test, a single
    row at prediction time) is encoded with the training frequencies
    rather than with frequencies recomputed from that data.

    Parameters
    ----------
    cat_cols : list of str or None, default=None
        Names of the DataFrame columns to encode. ``None`` means that no
        column is encoded.

    Attributes
    ----------
    freq_maps_ : dict
        Created by ``fit``. Maps each column name to a dict of
        ``{category: relative frequency}``.
    """

    def __init__(self, cat_cols=None):
        # None (not []) as the default avoids one list object being shared
        # between every instance created without an explicit argument.
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        """Learn the per-column category frequencies from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``cat_cols``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        columns = [] if self.cat_cols is None else self.cat_cols
        # value_counts skips missing values, so NaN never becomes a key.
        self.freq_maps_ = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns frequency-encoded.

        Categories that were not present during ``fit`` (and missing
        values) are mapped to NaN.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column that was encoded during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A new DataFrame; the caller's ``X`` is left untouched.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'freq_maps_')
        # Work on a copy: writing into the caller's frame would make a second
        # transform of the same object encode already-encoded values.
        encoded = X.copy()
        for col, freq in self.freq_maps_.items():
            encoded[col] = encoded[col].map(freq)
        return encoded
def pipeline_model(cat_cols):
    """Assemble the preprocessing + LightGBM classification pipeline.

    Steps, in order: frequency encoding of ``cat_cols``, mean imputation,
    degree-2 polynomial feature expansion (no bias column), and an
    ``LGBMClassifier`` with logging silenced.

    Args:
        cat_cols: Column names handed to the ``Freqencode`` step.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    encoder = Freqencode(cat_cols=cat_cols)
    mean_imputer = SimpleImputer(strategy='mean')
    poly_expander = PolynomialFeatures(
        degree=2,
        interaction_only=False,
        include_bias=False,
    )
    classifier = lgb.LGBMClassifier(verbose=-1)

    stages = [
        ('freq_encode', encoder),
        ('imputer', mean_imputer),
        ('poly', poly_expander),
        ('model', classifier),
    ]
    return Pipeline(steps=stages)
if __name__ == '__main__':
    # Script entry point: load the data, fit the pipeline, then report
    # metrics computed on the same data the model was trained on.
    path = r"C:\Users\12048\Desktop\python_code\data\train.csv"
    data = load_data(path)

    # Categorical features are the object-dtype columns.
    object_frame = data.select_dtypes(include=['object'])
    cat_cols = list(object_frame.columns)

    # Separate the feature matrix from the target column.
    y = data['subscribe']
    x = data.drop(labels='subscribe', axis=1)

    pip_model = pipeline_model(cat_cols)
    pip_model.fit(x, y)

    print('训练集表现:')
    # Column 1 of predict_proba is used as the score for class 1.
    prob = pip_model.predict_proba(x)[:, 1]
    # Hard labels: strictly above 0.5 counts as class 1.
    train_pred = []
    for score in prob:
        train_pred.append(1 if score > 0.5 else 0)

    matrix = confusion_matrix(y, train_pred)
    print('混淆矩阵:\n', matrix)
    report = classification_report(y, train_pred)
    print('模型报告:\n', report)
    auc = roc_auc_score(y, prob)
    print('auc:', auc)