基于Python的Word2Vec和GloVe实例
以下是一些基于 Python 的 Word2Vec 和 GloVe 实现示例,涵盖训练、可视化、应用等多个方面。示例代码基于 gensim、glove-python 等库,可直接运行或调整参数使用。
训练Word2Vec模型
from gensim.models import Word2Vec
# Toy corpus: each sentence is an already-tokenised list of words.
sentences = [
    ["cat", "say", "meow"],
    ["dog", "say", "woof"],
]

# Train 100-dimensional vectors; min_count=1 so that no word of this tiny
# corpus is dropped for being too rare.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Write the trained model to "word2vec.model" in the working directory.
model.save("word2vec.model")
加载预训练Word2Vec模型
from gensim.models import KeyedVectors
# Read vectors stored in the binary word2vec format; the file is expected
# to be present in the working directory.
model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True,
)

# Show the nearest neighbours of "king" as returned by most_similar().
print(model.most_similar("king"))
训练GloVe模型
from glove import Corpus, Glove
# Toy corpus: each document is an already-tokenised list of words.
texts = [
    ["hello", "world"],
    ["machine", "learning"],
]

# Step 1: build the co-occurrence statistics with a 10-word window.
corpus = Corpus()
corpus.fit(texts, window=10)

# Step 2: fit 100-dimensional GloVe vectors on the co-occurrence matrix.
glove = Glove(no_components=100)
glove.fit(corpus.matrix, epochs=30)

# Step 3: attach the corpus dictionary to the fitted model so that vectors
# can be addressed by word.
glove.add_dictionary(corpus.dictionary)
词向量可视化(PCA降维)
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Project a handful of word vectors onto two PCA components and label each
# point with its word.
# NOTE(review): `model` is assumed to be an object that supports `model[word]`
# lookups (e.g. the KeyedVectors loaded earlier) and that contains every word
# listed below — confirm before running.
words = ["king", "queen", "man", "woman"]
vectors = [model[w] for w in words]  # one vector per word, same order as `words`

pca = PCA(n_components=2)
result = pca.fit_transform(vectors)  # one 2-D row per entry of `words`

plt.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    # The loop body has to be indented; with the annotate() call at column 0
    # the snippet does not even parse.
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()
计算词语相似度
# Similarity score between the vectors of "woman" and "man".
# NOTE(review): `model` is presumably the word-vector object loaded earlier
# in this article — confirm it is in scope.
similarity = model.similarity("woman", "man")
# Report the score with four decimal places.
print(f"Cosine similarity: {similarity:.4f}")
词语类比推理
# Analogy query: "woman" and "king" contribute positively, "man" negatively.
# NOTE(review): `model` is presumably the word-vector object loaded earlier
# in this article — confirm it is in scope.
result = model.most_similar(positive=["woman", "king"], negative=["man"])
# result[0] is the first returned entry; it is unpacked as (word, score).
print(f"woman + king - man = {result[0][0]} (score: {result[0][1]:.4f})")
查找不相关词语
# Ask the model which word of the list does not fit with the others.
# NOTE(review): `model` is presumably the word-vector object loaded earlier
# in this article — confirm it is in scope.
odd_one = model.doesnt_match(["breakfast", "cereal", "dinner", "lunch"])
print(f"Odd one out: {odd_one}")
增量训练Word2Vec
# Additional tokenised sentences to continue training on.
# NOTE(review): `model` is assumed to be a trainable Word2Vec model (such as
# the one trained at the start of this article), not bare word vectors — confirm.
new_sentences = [
    ["python", "programming"],
    ["java", "coding"],
]

# First extend the existing vocabulary with the new words ...
model.build_vocab(new_sentences, update=True)

# ... then run five more training epochs over the new sentences only.
model.train(
    new_sentences,
    total_examples=model.corpus_count,
    epochs=5,
)
使用预训练GloVe向量
import numpy as np
def load_glove(file: str) -> dict:
    """Load word vectors from a GloVe-style text file.

    Every non-empty line is expected to look like ``word v1 v2 ... vn``,
    with fields separated by whitespace.

    Args:
        file: Path of the UTF-8 encoded text file to read.

    Returns:
        A dict mapping each word to a 1-D ``float32`` NumPy array holding the
        remaining fields of its line. If a word occurs on several lines, the
        last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector field cannot be parsed as a float. This also
            happens for a token that itself contains whitespace, because its
            extra pieces are treated as vector components.
    """
    embeddings = {}
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line, e.g. at the end of the
                # file: there is no word to index, so skip it.
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
# Load the pre-trained vectors from a GloVe text file; the relative path means
# the file is looked up in the current working directory.
# NOTE(review): the file name suggests 100-dimensional vectors — confirm.
glove_vectors = load_glove("glove.6B.100d.txt")
句子向量化(均值法)
import numpy as np
def sentence_vector(sentence, model):
words = [w