BERT文本分类微调笔记

发布于:2024-06-23 ⋅ 阅读:(64) ⋅ 点赞:(0)

BERT实现文本分类微调Demo

import random
from collections import namedtuple


'''
有四种文本需要做分类,请使用bert处理这个分类问题
'''

# A Category pairs a class label (name) with its example texts (samples).
Category = namedtuple('Category', ['name', 'samples'])

# The four target classes, each with two hand-written example sentences.
categories = [
    # Weather forecasts
    Category(
        name='Weather Forecast',
        samples=['今天北京晴转多云,气温20-25度。', '明天上海有小雨,记得带伞。'],
    ),
    # Company financial reports
    Category(
        name='Company Financial Report',
        samples=['本季度公司净利润增长20%。', '年度财务报告显示,成本控制良好。'],
    ),
    # Company audit materials
    Category(
        name='Company Audit Materials',
        samples=['审计发现内部控制存在漏洞。', '审计确认财务报表无重大错报。'],
    ),
    # Product marketing advertisements
    Category(
        name='Product Marketing Ad',
        samples=['新口味可乐,清爽上市!', '买一送一,仅限今日。'],
    ),
]

def generate_data(num_samples_per_category=50):
    """Build a simulated labelled dataset by sampling each category's examples.

    Args:
        num_samples_per_category: How many (text, label) pairs to draw for
            every category. Defaults to 50.

    Returns:
        A list of (text, label) tuples, grouped by category in the order of
        ``categories``. Each text is picked at random (with replacement) from
        that category's samples; the label is the category name.
    """
    return [
        (random.choice(cat.samples), cat.name)
        for cat in categories
        for _draw in range(num_samples_per_category)
    ]

# Build the simulated datasets: 100 samples per category for training and a
# small demo test set of 6 samples per category.
train_data = generate_data(num_samples_per_category=100)
test_data = generate_data(num_samples_per_category=6)



'''
train_data = 
[('明天上海有小雨,记得带伞。', 'Weather Forecast'),
 ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
 ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
 ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
 ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
 ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
 ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
 ('明天上海有小雨,记得带伞。', 'Weather Forecast'),
 ('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),]
'''








from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F

# Step 1: map each category name to an integer class index.
label_map = {category.name: index for index, category in enumerate(categories)}
num_labels = len(categories)  # total number of classes

# Step 2: load the pretrained tokenizer and the classification model.
# The sample texts are Chinese, so the Chinese checkpoint is used.
# 'bert-base-uncased' is an English checkpoint whose vocabulary is not built
# for Chinese text.
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=num_labels,  # size of the newly initialised classification head
)

# 步骤3: 准备数据集
def encode_texts(texts, labels):
    """Tokenize texts and convert label names to class-index tensors.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of category names, aligned with ``texts``; every
            name must be a key of ``label_map``.

    Returns:
        A ``(encodings, label_ids)`` pair: the tokenizer output (padded,
        truncated, PyTorch tensors) and a tensor of integer class indices.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    # The tokenizer documents ``str`` / ``List[str]`` input, while callers may
    # hand over a tuple (e.g. from ``zip(*data)``); normalise to a list.
    text_list = list(texts)
    encodings = tokenizer(text_list, truncation=True, padding=True, return_tensors='pt')
    # Translate label names into their integer class indices.
    label_ids = torch.tensor([label_map[label] for label in labels])
    return encodings, label_ids

def prepare_data(data, batch_size=8, shuffle=True):
    """Turn a list of (text, label) pairs into a ``DataLoader``.

    Args:
        data: Non-empty sequence of ``(text, label_name)`` tuples.
        batch_size: Number of samples per batch. Defaults to 8, the value
            that was previously hard-coded.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            True (the previous behaviour); pass False for evaluation data
            when a fixed order is wanted.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, label_ids)``
        batches.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError('prepare_data() requires at least one (text, label) pair')
    texts, labels = zip(*data)  # split the pairs into parallel sequences
    encodings, label_ids = encode_texts(texts, labels)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Step 4: wrap the training and test sets in data loaders.
train_loader, test_loader = prepare_data(train_data), prepare_data(test_data)

# Step 5: choose the compute device (GPU when available) and move the model
# onto it; the training and evaluation functions read this ``device``.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def train_epoch(model, data_loader, optimizer):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The average of the per-batch loss values.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        # Move all three batch tensors to the module-level ``device``.
        ids, mask, targets = (tensor.to(device) for tensor in batch)

        batch_loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()
    return running_loss / len(data_loader)

def evaluate(model, data_loader):
    """Compute the classification accuracy of ``model`` on ``data_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():  # no gradients are needed for scoring
        for batch in data_loader:
            ids, mask, targets = (tensor.to(device) for tensor in batch)

            logits = model(ids, attention_mask=mask).logits
            predicted = logits.argmax(dim=1)  # best class per sample
            correct += (predicted == targets).sum().item()
            seen += targets.size(0)
    return correct / seen

# Step 6: fine-tune the model.
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are set
# explicitly to the deprecated class's defaults (1e-6 and 0.0) so these
# hyperparameters are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):  # train for 3 epochs
    train_loss = train_epoch(model, train_loader, optimizer)
    acc = evaluate(model, test_loader)  # accuracy on the test loader after each epoch
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Test Accuracy: {acc*100:.2f}%')

# Step 7: predict with the fine-tuned model.
def predict(text):
    """Classify one or more texts with the fine-tuned model.

    Switches the model to evaluation mode as a side effect.

    Args:
        text: A single string, or a list/tuple of strings.

    Returns:
        The predicted category name (``str``) when exactly one text is given,
        either as a bare string or a one-element sequence. When several texts
        are given, a list of category names in input order.

    Raises:
        ValueError: If ``text`` is an empty sequence.
    """
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        raise ValueError('predict() requires at least one text')

    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Take the argmax per row. Without ``dim`` the index refers to the
    # flattened (batch x classes) tensor, which is only a valid class id when
    # the batch holds a single text.
    class_ids = torch.argmax(logits, dim=1).tolist()
    names = [categories[class_id].name for class_id in class_ids]
    return names[0] if len(names) == 1 else names

# Classify a new, unseen text and print the predicted category name.
new_text = ["明天的天气怎么样?"]  # note: passed as a one-element list
predicted_category = predict(new_text)
print(f'The predicted category for the new text is: {predicted_category}')




网站公告

今日签到

点亮在社区的每一天
去签到