✨ NLP 项目常见工具的使用拆解
1. tokenizer
from torchtext.data.utils import get_tokenizer
# Instantiate torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")

# Sample text containing a contraction and punctuation, so the printed tokens
# show how the tokenizer lowercases words and splits those characters off.
text_sample = "We're going on an adventure! The weather is really nice today."

# Tokenize the sample and display the resulting list of tokens.
tokens = tokenizer(text_sample)
print(tokens)
['we', "'", 're', 'going', 'on', 'an', 'adventure', '!', 'the', 'weather', 'is', 'really', 'nice', 'today', '.']
2. vocab
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
# Create the tokenizer.
tokenizer = get_tokenizer("basic_english")

# Toy corpus used to build the vocabulary.
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# Tokenize lazily: each sentence is split only when the builder consumes it.
token_stream = map(tokenizer, test_sentences)
vocab = build_vocab_from_iterator(
    token_stream,
    specials=["<unk>", "<pad>"],
    min_freq=1,  # keep every token that occurs at least once
)

# Lookups of tokens that are not in the vocabulary return the <unk> index.
vocab.set_default_index(vocab["<unk>"])

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])
词表大小: 21
'fox'的索引: 10
3. DataLoader(示例1)
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
# 1. Create the tokenizer.
tokenizer = get_tokenizer("basic_english")

# 2. Toy corpora; in this demo both splits hold the same two sentences.
train_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
    # More training sentences can be added here.
]
test_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello world! This is a test for building vocabulary.",
]

# 3. Build the vocabulary from the training sentences.
vocab = build_vocab_from_iterator(
    map(tokenizer, train_sentences),
    specials=["<unk>", "<pad>"],
    min_freq=1,
)

# Lookups of tokens that are not in the vocabulary return the <unk> index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])
# 4. Custom Dataset: one tensor of token indices per sentence.
class TextDataset(Dataset):
    """Map-style dataset that converts raw sentences into index tensors.

    Args:
        sentences: Sequence of raw text strings, addressed by position.
        vocab: Token-to-index mapping; only ``vocab[token]`` is used.
        tokenizer: Callable that splits one sentence into a list of tokens.
    """

    def __init__(self, sentences, vocab, tokenizer):
        self.sentences = sentences
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx: int) -> torch.Tensor:
        """Return sentence ``idx`` as a 1-D ``torch.long`` tensor of indices."""
        tokens = self.tokenizer(self.sentences[idx])
        indices = [self.vocab[token] for token in tokens]
        return torch.tensor(indices, dtype=torch.long)
# 5. Create one Dataset per split; both share the same vocabulary and tokenizer.
train_dataset = TextDataset(
    sentences=train_sentences,
    vocab=vocab,
    tokenizer=tokenizer,
)
test_dataset = TextDataset(
    sentences=test_sentences,
    vocab=vocab,
    tokenizer=tokenizer,
)
# 6. Collate function: pad variable-length sequences into one batch tensor.
def collate_fn(batch, *, padding_value=None):
    """Combine a list of 1-D index tensors into one right-padded 2-D tensor.

    Args:
        batch: List of 1-D tensors, one per sample.
        padding_value: Value written after the end of shorter sequences.
            When omitted (as happens when a ``DataLoader`` calls this with
            only ``batch``), the ``<pad>`` index of the module-level
            ``vocab`` is used.

    Returns:
        Tensor of shape ``(len(batch), length_of_longest_sequence)``.
    """
    if padding_value is None:
        # Resolved at call time so the function follows the current vocab.
        padding_value = vocab['<pad>']
    return pad_sequence(batch, batch_first=True, padding_value=padding_value)
# Batch both splits; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 7. Check the DataLoader output: print only the first batch of each loader.
print("\n=== Train Batch Indices ===")
for batch in train_loader:
    print(batch)
    break  # one batch is enough for a visual check

print("\n=== Test Batch Indices ===")
for batch in test_loader:
    print(batch)
    break  # one batch is enough for a visual check
=== Train Batch Indices ===
tensor([[11, 20, 4, 18, 12, 5, 17, 9, 7, 19, 2],
[ 3, 16, 6, 10, 13, 15, 3, 14, 8, 2, 1]])

=== Test Batch Indices ===
tensor([[ 3, 16, 6, 10, 13, 15, 3, 14, 8, 2, 1],
[11, 20, 4, 18, 12, 5, 17, 9, 7, 19, 2]])
4. DataLoader(示例2)
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
# 1. Create the tokenizer.
tokenizer = get_tokenizer("basic_english")

# 2. Labelled training and test data as (sentence, label) pairs.
train_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),  # positive sentiment
    ("Hello world! This is a test for building vocabulary.", 0),  # negative sentiment
    # More (sentence, label) pairs can be added here.
]
test_data = [
    ("The quick brown fox jumps over the lazy dog.", 1),
    ("Hello world! This is a test for building vocabulary.", 0),
]

# 3. Build the vocabulary from the training sentences only; labels are ignored.
vocab = build_vocab_from_iterator(
    (tokenizer(text) for text, _label in train_data),
    specials=["<unk>", "<pad>"],
    min_freq=1,
)

# Lookups of tokens that are not in the vocabulary return the <unk> index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

print("词表大小:", len(vocab))
print("'fox'的索引:", vocab['fox'])
# 4. Custom Dataset returning (indices_tensor, label_tensor) per sample.
class TextDataset(Dataset):
    """Map-style dataset over labelled sentences.

    Args:
        data: Sequence of ``(sentence, label)`` pairs, addressed by position.
        vocab: Token-to-index mapping; only ``vocab[token]`` is used.
        tokenizer: Callable that splits one sentence into a list of tokens.
    """

    def __init__(self, data, vocab, tokenizer):
        self.data = data
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """Return the number of ``(sentence, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as ``(indices, label)``.

        ``indices`` is a 1-D ``torch.long`` tensor of token indices and
        ``label`` is a 0-D ``torch.long`` tensor.
        """
        sentence, label = self.data[idx]
        tokens = self.tokenizer(sentence)
        indices = [self.vocab[token] for token in tokens]
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )
# 5. Padding and collate function for (sequence, label) samples.
def collate_fn(batch, *, padding_value=None):
    """Collate ``(sequence, label)`` samples into padded sequences and labels.

    Args:
        batch: List of ``(sequence, label)`` pairs, where ``sequence`` is a
            1-D tensor and ``label`` is a tensor; all labels must share one
            shape so they can be stacked.
        padding_value: Value written after the end of shorter sequences.
            When omitted (as happens when a ``DataLoader`` calls this with
            only ``batch``), the ``<pad>`` index of the module-level
            ``vocab`` is used.

    Returns:
        ``(padded_seqs, labels_tensor)``: the sequences right-padded to the
        longest one with the batch dimension first, and the labels stacked
        along a new first dimension.
    """
    if padding_value is None:
        # Resolved at call time so the function follows the current vocab.
        padding_value = vocab['<pad>']
    # Transpose the list of pairs into one tuple of sequences and one of labels.
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(
        list(sequences), batch_first=True, padding_value=padding_value
    )
    labels_tensor = torch.stack(labels)
    return padded_seqs, labels_tensor
# 6. Create the Datasets and their DataLoaders.
train_dataset = TextDataset(train_data, vocab, tokenizer)
test_dataset = TextDataset(test_data, vocab, tokenizer)

# Settings shared by both loaders; only shuffling differs between the splits.
loader_kwargs = dict(batch_size=2, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)
# 7. Check the output: print only the first batch of each loader.
print("\n=== Train Batch ===")
for seq_batch, label_batch in train_loader:
    print("Sequences:", seq_batch)
    print("Labels: ", label_batch)
    break  # one batch is enough for a visual check

print("\n=== Test Batch ===")
for seq_batch, label_batch in test_loader:
    print("Sequences:", seq_batch)
    print("Labels: ", label_batch)
    break  # one batch is enough for a visual check