PyTorch in Practice: A Complete Guide to Building and Training on a Custom Corpus from Scratch
I. Introduction: Why Build a Custom Corpus?
In NLP tasks, public datasets (such as IMDB movie reviews or Wikipedia) are easy to obtain, but they often fail to cover scenario-specific needs such as industry terminology, dialects, or vertical-domain knowledge. Building a custom corpus lets you:
Train language models targeted at a specific domain (e.g., medical or legal text).
Address data-privacy constraints (e.g., internal company data that cannot be released).
Improve the model's fit to niche or low-resource languages.
This article uses sentiment analysis as the running example and shows how to train a classification model on a custom corpus with PyTorch.
II. The Complete Workflow for Building a Custom Corpus
1. Data Collection and Preparation
Data sources
Public data: crawl vertical-domain websites (e.g., Zhihu industry topics, Douban book reviews).
Private data: internal company logs, customer-service transcripts (these must be anonymized first).
Manual annotation: use a tool such as Label Studio to assign sentiment labels (positive/negative).
Data format
Save the texts and labels as a CSV file, for example:
text,label
"这个产品非常好用",1
"服务态度太差了",0
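As a quick check that this layout parses the way pandas expects, here is a minimal sketch that writes and reads back such a file (the two sample rows mean roughly "this product is very easy to use", label 1, and "the service attitude is terrible", label 0; the file name custom_corpus.csv matches the one loaded in the next step):
import pandas as pd

# Build a tiny example corpus and save it in the text,label layout shown above
sample = pd.DataFrame({
    "text": ["这个产品非常好用", "服务态度太差了"],
    "label": [1, 0],
})
sample.to_csv("custom_corpus.csv", index=False)

# Read it back to confirm the columns are parsed as expected
print(pd.read_csv("custom_corpus.csv").head())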
2. Text Preprocessing
Step 1: Load the data
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
df = pd.read_csv("custom_corpus.csv")
texts = df["text"].tolist()
labels = df["label"].tolist()

# Split off 20% of the data for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
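The surrounding text mentions a train/validation/test split, while the snippet above only creates a validation set. If a separate test set is also wanted, one hedged option is to split the hold-out portion again; the 50/50 ratio below is purely illustrative:
# Optionally split the 20% hold-out in half to obtain validation and test sets
val_texts, test_texts, val_labels, test_labels = train_test_split(
    val_texts, val_labels, test_size=0.5, random_state=42
)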
Step 2: Text cleaning
Note: the cleaning function below assumes English text (NLTK tokenization, English stopwords, and an ASCII-only regex); for a Chinese corpus like the CSV sample above, use a word segmenter instead (see the sketch at the end of this step).
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# The tokenizer model and stopword list must be downloaded once before first use
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove anything that is not a letter or whitespace (also strips digits)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Tokenize and remove stopwords and punctuation
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return " ".join(tokens)
# Apply the cleaning to both splits
train_texts_clean = [preprocess_text(text) for text in train_texts]
val_texts_clean = [preprocess_text(text) for text in val_texts]
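For Chinese text, the ASCII-only regex above would delete every character, so the cleaner cannot be used as-is. A minimal alternative sketch using the jieba segmenter (an assumed extra dependency, not part of the original pipeline):
import jieba  # assumed dependency: pip install jieba

def preprocess_text_zh(text):
    # Segment Chinese text into words; jieba.lcut returns a list of tokens
    tokens = jieba.lcut(text)
    # Drop whitespace-only tokens so the joined string stays clean
    tokens = [t for t in tokens if t.strip()]
    return " ".join(tokens)

print(preprocess_text_zh("这个产品非常好用"))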
Step 3: Build the vocabulary
from collections import Counter

# Count word frequencies over the training split
vocab = Counter()
for text in train_texts_clean:
    vocab.update(text.split())

# Vocabulary size and special tokens
vocab_size = 10000        # keep only the most frequent words
UNK_TOKEN = "<UNK>"       # unknown word
PAD_TOKEN = "<PAD>"       # padding symbol

# Map the (vocab_size - 2) most frequent words to indices, reserving 0 and 1 for the special tokens
word2idx = {word: idx + 2 for idx, (word, _) in enumerate(vocab.most_common(vocab_size - 2))}
word2idx[UNK_TOKEN] = 0
word2idx[PAD_TOKEN] = 1

# Reverse mapping from index to word
idx2word = {idx: word for word, idx in word2idx.items()}
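To vectorize new text at inference time with the exact same vocabulary, it helps to persist the mapping; a minimal sketch using the standard json module (the file name vocab.json is just an example):
import json

# Save the word-to-index mapping so inference reuses the same vocabulary
with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump(word2idx, f, ensure_ascii=False)

# Later, reload it before vectorizing new text
with open("vocab.json", "r", encoding="utf-8") as f:
    word2idx = json.load(f)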
Step 4: Text vectorization
import torch

def text_to_sequence(text, word2idx, max_len=50):
    tokens = text.split()
    sequence = [word2idx.get(token, word2idx[UNK_TOKEN]) for token in tokens]
    # Pad or truncate to a fixed length
    if len(sequence) < max_len:
        sequence += [word2idx[PAD_TOKEN]] * (max_len - len(sequence))
    else:
        sequence = sequence[:max_len]
    return sequence

# Convert all texts
max_len = 50
X_train = [text_to_sequence(text, word2idx, max_len) for text in train_texts_clean]
X_val = [text_to_sequence(text, word2idx, max_len) for text in val_texts_clean]

# Convert to PyTorch tensors
X_train_tensor = torch.LongTensor(X_train)
X_val_tensor = torch.LongTensor(X_val)
y_train_tensor = torch.LongTensor(train_labels)
y_val_tensor = torch.LongTensor(val_labels)
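The training loop later in the article slices these tensors by hand; an equally common and arguably more idiomatic option is torch.utils.data, sketched below (the batch size of 32 mirrors the value used later):
from torch.utils.data import TensorDataset, DataLoader

# Wrap the tensors so DataLoader can shuffle and batch them automatically
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for batch_X, batch_y in train_loader:
    # batch_X: [batch_size, max_len], batch_y: [batch_size]
    break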
3. Define the Neural Network Model
Model architecture
Use a TextCNN (convolutional neural network) for text classification:
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=[3, 4, 5]):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)
        # One convolution (single filter) per kernel size
        self.convs = nn.ModuleList([
            nn.Conv2d(1, 1, (k, embed_dim)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes), num_classes)

    def forward(self, x):
        # x shape: [batch_size, seq_len]
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        x = x.unsqueeze(1)     # [batch_size, 1, seq_len, embed_dim]
        # Apply each convolution kernel
        conv_outputs = []
        for conv in self.convs:
            out = conv(x)                         # [batch_size, 1, seq_len-k+1, 1]
            out = F.relu(out.squeeze(3))          # [batch_size, 1, seq_len-k+1]
            out = F.max_pool1d(out, out.size(2))  # [batch_size, 1, 1]
            conv_outputs.append(out.squeeze(2))   # [batch_size, 1]
        # Concatenate the pooled features from all kernel sizes
        x = torch.cat(conv_outputs, dim=1)  # [batch_size, len(kernel_sizes)]
        x = self.fc(x)                      # [batch_size, num_classes]
        return x
Initialize the model
vocab_size = len(word2idx)
embed_dim = 128
num_classes = 2  # binary classification
model = TextCNN(vocab_size, embed_dim, num_classes)
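Before training, a quick forward pass on random indices is a cheap way to confirm the tensor shapes line up (purely a sanity check, not part of the original pipeline):
# Fake batch of 4 sequences of length 50, drawn from the vocabulary index range
dummy_input = torch.randint(0, vocab_size, (4, 50))
with torch.no_grad():
    logits = model(dummy_input)
print(logits.shape)  # expected: torch.Size([4, 2])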
4. Train the Model
Define the loss function and optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
Training loop
def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    for epoch in range(epochs):
        model.train()  # re-enable training mode (evaluate_model switches to eval mode)
        total_loss = 0
        for i in range(0, len(X_train), batch_size):
            # Slice out the current mini-batch
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]
            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Evaluate on the validation set after each epoch
        val_loss, val_acc = evaluate_model(model, X_val, y_val, batch_size)
        print(f"Epoch {epoch+1}, Train Loss: {total_loss/(len(X_train)/batch_size):.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
def evaluate_model(model, X_val, y_val, batch_size):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for i in range(0, len(X_val), batch_size):
            batch_X = X_val[i:i+batch_size]
            batch_y = y_val[i:i+batch_size]
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
            # Count correct predictions for accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    accuracy = correct / total
    return total_loss / (len(X_val) / batch_size), accuracy
# Start training
train_model(model, X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)
5. Model Evaluation and Optimization
Evaluation metrics
Accuracy: the share of samples classified correctly.
F1 score: balances precision and recall, which matters for class-imbalanced data (see the sketch after this list).
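A minimal sketch of computing both metrics on the validation set with scikit-learn (it reuses the model and tensors defined above and runs the whole validation set in one forward pass):
from sklearn.metrics import accuracy_score, f1_score

model.eval()
with torch.no_grad():
    val_preds = model(X_val_tensor).argmax(dim=1)

# Compare predictions with the ground-truth labels
print("Accuracy:", accuracy_score(y_val_tensor.numpy(), val_preds.numpy()))
print("F1:", f1_score(y_val_tensor.numpy(), val_preds.numpy()))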
Directions for improvement
Hyperparameter tuning: adjust the learning rate, batch size, and convolution kernel sizes.
Model upgrades: try more expressive architectures such as LSTM or BERT.
Data augmentation: expand the dataset with synonym replacement or back translation.
III. Summary and Extensions
Key takeaways
Data preprocessing: cleaning, tokenization, and vocabulary construction are the foundation.
Model design: TextCNN works well for short-text classification; LSTM/Transformer models suit longer text.
Training techniques: mini-batch training, learning-rate scheduling, and early stopping help prevent overfitting (a sketch follows this list).
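As a hedged illustration of the scheduling and early-stopping idea, the skeleton below wraps the train_model/evaluate_model helpers defined earlier; the patience values and the file name best_model.pt are arbitrary choices, not from the original article:
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=2)
best_val_loss = float("inf")
patience, bad_epochs = 3, 0

for epoch in range(50):
    # Train for one epoch, then measure validation loss
    train_model(model, X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, epochs=1)
    val_loss, _ = evaluate_model(model, X_val_tensor, y_val_tensor, batch_size=32)
    scheduler.step(val_loss)  # reduce the learning rate when validation loss plateaus
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        bad_epochs = 0
        torch.save(model.state_dict(), "best_model.pt")  # keep the best checkpoint
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print("Early stopping triggered")
            break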
Further applications
Multilingual support: use the sentencepiece tokenizer to handle languages that are not whitespace-delimited (such as Chinese).
Deployment: export the model to TorchScript and integrate it into a Flask/FastAPI backend (see the export sketch below).
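A minimal sketch of the TorchScript export step (the example input shape and file name are assumptions; the Flask/FastAPI serving code itself is not shown):
# Trace the trained model with an example batch and save it for deployment
model.eval()
example_input = torch.randint(0, vocab_size, (1, 50))  # one padded sequence of max_len tokens
traced = torch.jit.trace(model, example_input)
traced.save("textcnn.pt")

# A Flask/FastAPI service would later reload it with:
# loaded = torch.jit.load("textcnn.pt")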

