# BERT 实现文本分类微调 Demo
import random
from collections import namedtuple
'''
有四种文本需要做分类,请使用 BERT 处理这个分类问题。
'''
# A category pairs a label name with the canned example sentences that
# belong to it.
Category = namedtuple('Category', 'name samples')

# The four text classes used by the demo, each with two sample sentences.
# The position of a category in this list is significant: later code derives
# the integer class id from it.
categories = [
    Category(
        name='Weather Forecast',
        samples=['今天北京晴转多云,气温20-25度。', '明天上海有小雨,记得带伞。'],
    ),
    Category(
        name='Company Financial Report',
        samples=['本季度公司净利润增长20%。', '年度财务报告显示,成本控制良好。'],
    ),
    Category(
        name='Company Audit Materials',
        samples=['审计发现内部控制存在漏洞。', '审计确认财务报表无重大错报。'],
    ),
    Category(
        name='Product Marketing Ad',
        samples=['新口味可乐,清爽上市!', '买一送一,仅限今日。'],
    ),
]
def generate_data(num_samples_per_category=50):
    """Build a simulated labelled dataset from the canned category samples.

    Args:
        num_samples_per_category: How many samples to draw for each
            category. Every draw picks one of the category's example
            sentences at random, so sentences repeat. Defaults to 50.

    Returns:
        A list of ``(text, label)`` tuples, where ``label`` is the category
        name. Entries are grouped by category, in the order the categories
        are declared.
    """
    return [
        (random.choice(cat.samples), cat.name)
        for cat in categories
        for _draw in range(num_samples_per_category)
    ]
# Simulated corpora: 100 draws per category for training and 6 draws per
# category for evaluation.
train_data = generate_data(num_samples_per_category=100)
test_data = generate_data(num_samples_per_category=6)
'''
train_data =
[('明天上海有小雨,记得带伞。', 'Weather Forecast'),
('明天上海有小雨,记得带伞。', 'Weather Forecast'),
('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),
('明天上海有小雨,记得带伞。', 'Weather Forecast'),
('明天上海有小雨,记得带伞。', 'Weather Forecast'),
('明天上海有小雨,记得带伞。', 'Weather Forecast'),
('今天北京晴转多云,气温20-25度。', 'Weather Forecast'),]
'''
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn.functional as F
# Map every category name to an integer class id (its position in
# ``categories``); the classification head gets one output per category.
label_map = {category.name: index for index, category in enumerate(categories)}
num_labels = len(categories)

# The samples are Chinese, so load the Chinese checkpoint. The English-only
# 'bert-base-uncased' checkpoint was not pretrained on Chinese text and is a
# poor starting point for this data. Tokenizer and model must come from the
# same checkpoint so that token ids line up with the embedding table.
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=num_labels,
)
def encode_texts(texts, labels):
    """Tokenize texts and convert their label names to a tensor of class ids.

    Args:
        texts: Batch of raw text strings, passed to the module-level
            ``tokenizer`` unchanged.
        labels: Category names, one per text; each must be a key of the
            module-level ``label_map``.

    Returns:
        A ``(encodings, label_ids)`` pair: the tokenizer output (requested as
        padded, truncated PyTorch tensors) and a tensor holding the integer
        class id of every label.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    class_ids = [label_map[name] for name in labels]
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return tokenized, torch.tensor(class_ids)
def prepare_data(data, batch_size=8, shuffle=True):
    """Turn ``(text, label)`` pairs into a ``DataLoader`` of tensor batches.

    Args:
        data: Iterable of ``(text, label)`` tuples.
        batch_size: Number of samples per batch. Defaults to 8.
        shuffle: Whether the loader reshuffles the samples on each pass.
            Defaults to True; pass False for a deterministic evaluation
            order.

    Returns:
        A ``DataLoader`` whose batches are
        ``(input_ids, attention_mask, label_ids)`` tensor triples.

    Raises:
        ValueError: If ``data`` contains no samples.
    """
    # Materialise once so generators are supported and emptiness can be
    # checked before unzipping.
    data = list(data)
    if not data:
        raise ValueError('prepare_data() requires at least one (text, label) pair')
    texts, labels = zip(*data)
    encodings, label_ids = encode_texts(texts, labels)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, and
# move the model's parameters there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Batched loaders over the simulated training and evaluation sets.
train_loader = prepare_data(train_data)
test_loader = prepare_data(test_data)
def train_epoch(model, data_loader, optimizer):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model called with input ids, an attention mask and labels;
            its output must expose ``.loss``.
        data_loader: Iterable yielding
            ``(input_ids, attention_mask, labels)`` batches; must support
            ``len()``.
        optimizer: Optimizer whose gradients are reset before, and stepped
            after, each backward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for input_ids, attention_mask, labels in data_loader:
        optimizer.zero_grad()
        # Batches are moved to the module-level ``device`` right before the
        # forward pass.
        outputs = model(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device),
        )
        loss = outputs.loss
        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    return sum(batch_losses) / len(data_loader)
def evaluate(model, data_loader):
    """Measure classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Model called with input ids and an attention mask; its output
            must expose ``.logits``.
        data_loader: Iterable yielding
            ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        The fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    correct = 0
    seen = 0
    # No gradients are needed for scoring.
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            labels = labels.to(device)
            logits = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
            ).logits
            # Highest-scoring class per sample, compared element-wise with
            # the ground truth.
            correct += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.shape[0]
    return correct / seen
# Use the PyTorch AdamW implementation: the ``AdamW`` class shipped with
# transformers is deprecated in favour of ``torch.optim.AdamW``. ``eps`` and
# ``weight_decay`` are set explicitly to the deprecated class's defaults so
# the fine-tuning hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Fine-tune for three epochs, reporting the mean training loss and the
# accuracy on the test loader after each one.
for epoch in range(3):
    train_loss = train_epoch(model, train_loader, optimizer)
    acc = evaluate(model, test_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Test Accuracy: {acc*100:.2f}%')
def predict(text):
    """Classify one or more texts with the fine-tuned model.

    Args:
        text: A single string, or a list/tuple of strings.

    Returns:
        The predicted category name when exactly one text is given (a bare
        string or a one-element sequence); otherwise a list of category
        names, one per input text, in input order.
    """
    texts = [text] if isinstance(text, str) else list(text)
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    # Do not rely on the caller having left the model in evaluation mode.
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # Take the argmax per row. Without ``dim`` the logits are flattened, which
    # yields a wrong (possibly out-of-range) class id as soon as more than one
    # text is passed.
    predicted_class_ids = torch.argmax(outputs.logits, dim=1).tolist()
    names = [categories[class_id].name for class_id in predicted_class_ids]
    return names[0] if len(names) == 1 else names
# Smoke test: classify one unseen sentence and print the predicted category.
new_text = ["明天的天气怎么样?"]
predicted_category = predict(text=new_text)
print(f'The predicted category for the new text is: {predicted_category}')