[NLP Intro Series 5] A Chinese Text Classification Case Study

Published: 2025-07-06


About the author: a hard-working undergraduate (class of 2022) 🌟, exploring the world of AI algorithms, C++, and Go; searching for light in the fog 🌸
Blog homepage: 羊小猪~~ - CSDN博客
Summary: this post is an introductory NLP project, a Chinese text classification case study.
🌸 Motto 🌸: go and find the ideal "castle in the sky"
Previous post: [NLP Intro Series 4] An introductory review text classification case - CSDN博客
💁: Building the right data format for NLP is genuinely tricky; the main sticking point here is the data layout needed for text vectorization.

1. Data Preparation

import pandas as pd
import torchtext
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Read the data (tab-separated, no header row)
data_df = pd.read_csv("./train.csv", sep='\t', header=None)

# Add column names
data_df.columns = ["content", "label"]

data_df.head()
   content                                  label
0  还有双鸭山到淮阴的汽车票吗13号的        Travel-Query
1  从这里怎么回家                          Travel-Query
2  随便播放一首专辑阁楼里的佛里的歌        Music-Play
3  给看一下墓王之王嘛                      FilmTele-Play
4  我想看挑战两把s686打突变团竞的游戏视频  Video-Play
# Inspect the unique classes
t = data_df['label'].unique()
classesNum = len(t)
print("classes: ", t)
print("classes num: ", classesNum)
classes:  ['Travel-Query' 'Music-Play' 'FilmTele-Play' 'Video-Play' 'Radio-Listen'
 'HomeAppliance-Control' 'Weather-Query' 'Alarm-Update' 'Calendar-Query'
 'TVProgram-Play' 'Audio-Play' 'Other']
classes num:  12

2. Label Encoding

from sklearn.preprocessing import LabelEncoder

labels = data_df['label']

# Create the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_list = label_encoder.fit_transform(labels)

# Store the encoded labels
data_df["labelToNum"] = encoded_list

# Build an index -> class-name mapping
classes = {}
for idx, name in enumerate(label_encoder.classes_):
    classes[idx] = name

print(classes)
{0: 'Alarm-Update', 1: 'Audio-Play', 2: 'Calendar-Query', 3: 'FilmTele-Play', 4: 'HomeAppliance-Control', 5: 'Music-Play', 6: 'Other', 7: 'Radio-Listen', 8: 'TVProgram-Play', 9: 'Travel-Query', 10: 'Video-Play', 11: 'Weather-Query'}
data_df.head()
   content                                  label          labelToNum
0  还有双鸭山到淮阴的汽车票吗13号的        Travel-Query   9
1  从这里怎么回家                          Travel-Query   9
2  随便播放一首专辑阁楼里的佛里的歌        Music-Play     5
3  给看一下墓王之王嘛                      FilmTele-Play  3
4  我想看挑战两把s686打突变团竞的游戏视频  Video-Play     10
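As a quick sanity check (a minimal sketch using the label_encoder fitted above), LabelEncoder can also map the integer codes back to the original class names:

# inverse_transform maps encoded integers back to the original string labels
print(label_encoder.inverse_transform([9, 5, 3]))
# Expected, given the mapping printed above: ['Travel-Query' 'Music-Play' 'FilmTele-Play']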

3. Data Loading and Vocabulary Construction

Data loading

# Define the dataset format
class MyDataSet(Dataset):
    def __init__(self, dataframe):
        self.labels = dataframe["labelToNum"].tolist()
        self.texts = dataframe["content"].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.labels[idx], self.texts[idx]

# Load the data and peek at the first sample
data = MyDataSet(data_df)
for label, text in data:
    print(label)
    print(text)
    break
9
还有双鸭山到淮阴的汽车票吗13号的

Building the vocabulary

import jieba

# Use jieba for Chinese word segmentation
tokenizer = jieba.lcut

# Yield the tokens of each text in the dataset
def yield_tokens(data_iter):
    for _, text in data_iter:  # note the (label, text) order
        yield tokenizer(text)

# Build the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(data), specials=["<unk>"])

# Map out-of-vocabulary tokens to <unk>
vocab.set_default_index(vocab["<unk>"])

print("Vocab size:", len(vocab))
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WY118C~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.456 seconds.
Prefix dict has been built successfully.
Vocab size: 11147
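To see what the vocabulary does, a small round-trip check (a sketch using the vocab and tokenizer built above) converts a sentence into token ids and back:

sample = "还有双鸭山到淮阴的汽车票吗13号的"
tokens = tokenizer(sample)       # jieba word segmentation
ids = vocab(tokens)              # token -> index
print(tokens)
print(ids)
print(vocab.lookup_tokens(ids))  # index -> token, should round-trip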

Text vectorization

# Vectorization helpers
text_vector = lambda x: vocab(tokenizer(x))
label_num = lambda x: int(x)

# Build batches in the flat format nn.EmbeddingBag expects
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]

    for (label_, text_) in batch:
        # Label
        label_list.append(label_num(label_))

        # Text as a 1-D tensor of token ids
        temp = torch.tensor(text_vector(text_), dtype=torch.int64)
        text_list.append(temp)

        # Offset: the length of this text (its first dimension)
        offsets.append(temp.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    # Concatenate all texts into one flat 1-D tensor. This is the easy part to
    # get wrong: there is no padding; offsets mark where each text starts.
    text_list = torch.cat(text_list)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    return label_list.to(device), text_list.to(device), offsets.to(device)
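The offsets logic is the part that trips people up, so here is a tiny worked example (a standalone sketch with two made-up text lengths) showing what collate_batch computes:

# Two hypothetical tokenized texts, of lengths 3 and 2
lengths = [3, 2]
offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
print(offsets)  # tensor([0, 3]): text 0 starts at index 0, text 1 at index 3
# nn.EmbeddingBag then pools ids [0:3) for the first text and [3:5) for the
# second, producing one embedding vector per text.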

DataLoader setup

# Split the data: 80% train, 20% test
train_size = int(len(data) * 0.8)
test_size = len(data) - train_size
train_data, test_data = torch.utils.data.random_split(data, [train_size, test_size])

batch_size = 16

# Batch loading with the custom collate function
train_dl = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)

test_dl = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)
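A quick look at one batch (a sanity-check sketch) confirms the shapes: labels has batch_size entries, text is one flat 1-D tensor of token ids, and offsets holds batch_size starting positions:

labels, texts, offsets = next(iter(train_dl))
print(labels.shape)   # torch.Size([16])
print(texts.shape)    # 1-D: total number of tokens in the batch
print(offsets.shape)  # torch.Size([16])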

4. Model Definition

class TextModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()

        # Note: this is a simple intro example without RNN/LSTM layers; those
        # models need nn.Embedding (per-token vectors) to better capture
        # sequence information
        self.embeddingBag = nn.EmbeddingBag(vocab_size,  # vocabulary size
                                            embed_dim,   # embedding dimension
                                            sparse=False)

        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    # Initialize weights
    def init_weights(self):
        initrange = 0.5
        self.embeddingBag.weight.data.uniform_(-initrange, initrange)  # uniform init range
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()  # zero the bias

    def forward(self, text, offsets):
        embedding = self.embeddingBag(text, offsets)
        return self.fc(embedding)

vocab_len = len(vocab)
embed_dim = 64  # embed into 64 dimensions
model = TextModel(vocab_size=vocab_len, embed_dim=embed_dim, num_class=classesNum).to(device=device)
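Before training, a dummy forward pass (a minimal sketch with fabricated token ids) is a cheap way to verify that the output shape matches the number of classes:

dummy_text = torch.tensor([1, 2, 3, 4, 5], dtype=torch.long).to(device)
dummy_offsets = torch.tensor([0, 3], dtype=torch.long).to(device)  # two "texts"
with torch.no_grad():
    out = model(dummy_text, dummy_offsets)
print(out.shape)  # torch.Size([2, 12]): one score per class for each text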

5. Training and Testing Functions

def train(model, dataloader, optimizer, loss_fn):
    size = len(dataloader.dataset)  # number of samples
    num_batch = len(dataloader)     # number of batches

    train_acc = 0
    train_loss = 0

    for label, text, offset in dataloader:
        label, text, offset = label.to(device), text.to(device), offset.to(device)

        predict_label = model(text, offset)
        loss = loss_fn(predict_label, label)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc += (predict_label.argmax(1) == label).sum().item()
        train_loss += loss.item()

    train_acc /= size
    train_loss /= num_batch

    return train_acc, train_loss

def test(model, dataloader, loss_fn):
    size = len(dataloader.dataset)  # number of samples
    num_batch = len(dataloader)     # number of batches

    test_acc, test_loss = 0, 0

    with torch.no_grad():
        for label, text, offset in dataloader:
            label, text, offset = label.to(device), text.to(device), offset.to(device)

            predict = model(text, offset)
            loss = loss_fn(predict, label)

            test_acc += (predict.argmax(1) == label).sum().item()
            test_loss += loss.item()

    test_acc /= size
    test_loss /= num_batch

    return test_acc, test_loss

6. Model Training

import copy

# Hyperparameters
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
# StepLR multiplies the lr by gamma after every step_size calls to scheduler.step()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.01)

epochs = 10

train_acc, train_loss, test_acc, test_loss = [], [], [], []

best_acc = 0

for epoch in range(epochs):
    model.train()
    epoch_train_acc, epoch_train_loss = train(model, train_dl, optimizer, loss_fn)
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)

    model.eval()
    epoch_test_acc, epoch_test_loss = test(model, test_dl, loss_fn)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)

    if epoch_test_acc > best_acc:
        # Step the scheduler only when test accuracy improves
        scheduler.step()
        best_acc = epoch_test_acc
        best_model = copy.deepcopy(model)  # keep a copy of the best model

    # Current learning rate
    lr = optimizer.state_dict()['param_groups'][0]['lr']

    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}, Lr:{:.2E}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss, lr))

# Save the best model's parameters to file
path = './best_model.pth'
torch.save(best_model.state_dict(), path)
Epoch: 1, Train_acc:61.3%, Train_loss:1.450, Test_acc:77.2%, Test_loss:0.867, Lr:5.00E-01
Epoch: 2, Train_acc:80.4%, Train_loss:0.713, Test_acc:83.1%, Test_loss:0.585, Lr:5.00E-01
Epoch: 3, Train_acc:85.3%, Train_loss:0.516, Test_acc:85.7%, Test_loss:0.477, Lr:5.00E-01
Epoch: 4, Train_acc:88.2%, Train_loss:0.410, Test_acc:87.9%, Test_loss:0.414, Lr:5.00E-01
Epoch: 5, Train_acc:90.4%, Train_loss:0.338, Test_acc:89.3%, Test_loss:0.379, Lr:5.00E-03
Epoch: 6, Train_acc:92.1%, Train_loss:0.293, Test_acc:89.4%, Test_loss:0.378, Lr:5.00E-03
Epoch: 7, Train_acc:92.2%, Train_loss:0.291, Test_acc:89.5%, Test_loss:0.377, Lr:5.00E-03
Epoch: 8, Train_acc:92.2%, Train_loss:0.290, Test_acc:89.4%, Test_loss:0.376, Lr:5.00E-03
Epoch: 9, Train_acc:92.2%, Train_loss:0.289, Test_acc:89.3%, Test_loss:0.376, Lr:5.00E-03
Epoch:10, Train_acc:92.3%, Train_loss:0.288, Test_acc:89.3%, Test_loss:0.375, Lr:5.00E-03

7. Results Visualization

import matplotlib.pyplot as plt
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")               # ignore warning messages
plt.rcParams['font.sans-serif']    = ['SimHei'] # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False      # display minus signs correctly
plt.rcParams['figure.dpi']         = 100        # resolution

epoch_length = range(epochs)

plt.figure(figsize=(12, 3))

plt.subplot(1, 2, 1)
plt.plot(epoch_length, train_acc, label='Train Accuracy')
plt.plot(epoch_length, test_acc, label='Test Accuracy')
plt.legend(loc='lower right')
plt.title('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epoch_length, train_loss, label='Train Loss')
plt.plot(epoch_length, test_loss, label='Test Loss')
plt.legend(loc='upper right')
plt.title('Loss')

plt.show()


[Figure: train/test accuracy and loss curves over the 10 epochs]

8. Inference Test

# Load the best weights (map_location keeps this working on CPU-only machines)
model.load_state_dict(torch.load("./best_model.pth", map_location=device))
model.eval()  # evaluation mode

# Test sentence
test_sentence = "还有双鸭山到淮阴的汽车票吗13号的"

# Convert to token ids
token_ids = vocab(tokenizer(test_sentence))   # segment -> vocabulary indices
text = torch.tensor(token_ids, dtype=torch.long).to(device)  # convert to tensor
offsets = torch.tensor([0], dtype=torch.long).to(device)

# Inference; note: no gradients needed
with torch.no_grad():
    output = model(text, offsets)
    predicted_label = output.argmax(1).item()

print(f"Predicted class: {classes[predicted_label]}")
Predicted class: Travel-Query
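To classify several sentences at once, you can reuse the same offsets logic as collate_batch (a hedged sketch; the sentences are just examples taken from the training data):

sentences = ["从这里怎么回家", "随便播放一首专辑阁楼里的佛里的歌"]

# Flat id tensor plus per-sentence starting offsets, built by hand
ids = [torch.tensor(vocab(tokenizer(s)), dtype=torch.long) for s in sentences]
offsets = torch.tensor([0] + [t.size(0) for t in ids[:-1]]).cumsum(dim=0).to(device)
text = torch.cat(ids).to(device)

with torch.no_grad():
    preds = model(text, offsets).argmax(1)
print([classes[p.item()] for p in preds])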