# Example: random forest model.
# 1. Import the required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
def encode_object_columns(train: pd.DataFrame, test: pd.DataFrame, skip: int = 2) -> tuple:
    """Label-encode every object-typed train feature jointly with the test set.

    For each column of ``train`` from position ``skip`` onward whose dtype is
    ``object``, one ``LabelEncoder`` is fitted on the train and test values
    stacked together, so both frames share the same integer codes.

    Args:
        train: Training frame; its first ``skip`` columns are left untouched.
        test: Test frame; must contain every encoded column.
        skip: Number of leading train columns that are not features.

    Returns:
        ``(train, test, encoders)`` where the frames are encoded copies (the
        inputs are not modified) and ``encoders`` maps each encoded column
        name to its fitted ``LabelEncoder``.
    """
    train = train.copy()
    test = test.copy()
    encoders = {}
    n_train = len(train)
    for col in train.columns[skip:]:
        if train[col].dtype != object:
            continue
        encoder = LabelEncoder()
        # Fit on train + test together so categories that appear only in the
        # test set still receive a code.
        codes = encoder.fit_transform(pd.concat([train[col], test[col]]))
        train[col] = codes[:n_train]
        test[col] = codes[n_train:]
        encoders[col] = encoder  # kept so the mapping can be inspected later
    return train, test, encoders


def coerce_test_features(train: pd.DataFrame, test: pd.DataFrame, skip: int = 2) -> pd.DataFrame:
    """Return a copy of ``test`` whose feature columns are numeric and NaN-free.

    Any test column that is still ``object``-typed and is also one of the
    train feature columns (train columns from position ``skip`` onward) is
    converted with ``pd.to_numeric(errors='coerce')``; values that cannot be
    parsed become NaN. Every missing value in the frame is then replaced by 0.
    Object columns that are not train features are not converted.

    Args:
        train: Training frame, used only to decide which columns are features.
        test: Test frame to clean; it is not modified.
        skip: Number of leading train columns that are not features.

    Returns:
        The cleaned copy of ``test``.
    """
    feature_cols = set(train.columns[skip:])
    cleaned = test.copy()
    for col in cleaned.select_dtypes(include=['object']).columns.tolist():
        if col in feature_cols:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    return cleaned.fillna(0)


if __name__ == "__main__":
    # 2. Load the training and test sets.
    train = pd.read_excel('./data/data280993/traindata-new.xlsx')
    test = pd.read_excel('./data/data280993/testdata-new.xlsx')

    # 3. Feature engineering.
    # 3.1 Drop the 'DC50 (nM)' and 'Dmax (%)' columns from the training data.
    train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)

    # 3.2 Encode categorical features.
    # The first two train columns are skipped. NOTE(review): the label is read
    # from train['Label'] below and only one leading column is dropped from
    # test, so these are presumably 'uuid' and 'Label' - confirm against the data.
    train, test, label_encoders = encode_object_columns(train, test)

    # Convert any test feature that is still non-numeric, then fill missing values.
    # NOTE(review): only the test frame is NaN-filled; NaNs left in train are
    # passed to the model unchanged - confirm that is intended.
    test = coerce_test_features(train, test)

    # 4. Train the random forest.
    # The model is fitted exactly once, here; an earlier fit/predict that ran
    # before rf_model, X_train and y_train existed has been removed.
    X_train = train.iloc[:, 2:].values
    y_train = train['Label'].values
    # Relies on test columns 1.. lining up with train columns 2.. by position.
    X_test = test.iloc[:, 1:].values
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # 5. Predict and save the submission file locally.
    pred = rf_model.predict(X_test)
    result = pd.DataFrame({'uuid': test['uuid'], 'Label': pred})
    result.to_csv('submit_rf.csv', index=False)
# 0.73905
# 1. 导入需要用到的相关库
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
def encode_object_columns(train: pd.DataFrame, test: pd.DataFrame, skip: int = 2) -> tuple:
    """Label-encode every object-typed train feature jointly with the test set.

    For each column of ``train`` from position ``skip`` onward whose dtype is
    ``object``, one ``LabelEncoder`` is fitted on the train and test values
    stacked together, so both frames share the same integer codes.

    Args:
        train: Training frame; its first ``skip`` columns are left untouched.
        test: Test frame; must contain every encoded column.
        skip: Number of leading train columns that are not features.

    Returns:
        ``(train, test, encoders)`` where the frames are encoded copies (the
        inputs are not modified) and ``encoders`` maps each encoded column
        name to its fitted ``LabelEncoder``.
    """
    train = train.copy()
    test = test.copy()
    encoders = {}
    n_train = len(train)
    for col in train.columns[skip:]:
        if train[col].dtype != object:
            continue
        encoder = LabelEncoder()
        # Fit on train + test together so categories that appear only in the
        # test set still receive a code.
        codes = encoder.fit_transform(pd.concat([train[col], test[col]]))
        train[col] = codes[:n_train]
        test[col] = codes[n_train:]
        encoders[col] = encoder  # kept so the mapping can be inspected later
    return train, test, encoders


def coerce_test_features(train: pd.DataFrame, test: pd.DataFrame, skip: int = 2) -> pd.DataFrame:
    """Return a copy of ``test`` whose feature columns are numeric and NaN-free.

    Any test column that is still ``object``-typed and is also one of the
    train feature columns (train columns from position ``skip`` onward) is
    converted with ``pd.to_numeric(errors='coerce')``; values that cannot be
    parsed become NaN. Every missing value in the frame is then replaced by 0.
    Object columns that are not train features are not converted.

    Args:
        train: Training frame, used only to decide which columns are features.
        test: Test frame to clean; it is not modified.
        skip: Number of leading train columns that are not features.

    Returns:
        The cleaned copy of ``test``.
    """
    feature_cols = set(train.columns[skip:])
    cleaned = test.copy()
    for col in cleaned.select_dtypes(include=['object']).columns.tolist():
        if col in feature_cols:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    return cleaned.fillna(0)


if __name__ == "__main__":
    # 2. Load the training and test sets.
    train = pd.read_excel('./data/data280993/traindata-new.xlsx')
    test = pd.read_excel('./data/data280993/testdata-new.xlsx')

    # 3. Feature engineering.
    # 3.1 Drop the 'DC50 (nM)' and 'Dmax (%)' columns from the training data.
    train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)

    # 3.2 Encode categorical features.
    # The first two train columns are skipped. NOTE(review): the label is read
    # from train['Label'] below and only one leading column is dropped from
    # test, so these are presumably 'uuid' and 'Label' - confirm against the data.
    train, test, label_encoders = encode_object_columns(train, test)

    # Convert any test feature that is still non-numeric, then fill missing values.
    # NOTE(review): only the test frame is NaN-filled; NaNs left in train are
    # passed to the estimators unchanged - confirm that is intended.
    test = coerce_test_features(train, test)

    # Build the feature matrices. An earlier fit/predict that ran before
    # rf_model, X_train and y_train existed has been removed.
    X_train = train.iloc[:, 2:].values
    y_train = train['Label'].values
    # Relies on test columns 1.. lining up with train columns 2.. by position.
    X_test = test.iloc[:, 1:].values

    # 4. Feature selection: fit a random-forest-based selector on the training
    # data and apply the same column selection to both matrices.
    selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    selector.fit(X_train, y_train)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    # 5. Hyper-parameter tuning: grid of random forest settings to search.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
    # 5-fold cross-validated grid search over the selected features.
    rf_model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)
    grid_search.fit(X_train_selected, y_train)
    # Estimator with the best parameter combination found by the search.
    best_rf_model = grid_search.best_estimator_

    # 6. Predict with the best model and save the submission file locally.
    pred = best_rf_model.predict(X_test_selected)
    result = pd.DataFrame({'uuid': test['uuid'], 'Label': pred})
    result.to_csv('submit_rf.csv', index=False)
# 0.74296