The following are concrete implementations of TF (bag-of-words), TF-IDF, and skip-gram (Word2Vec).
- Bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

# Corpus
train_x = ['build fails due publication-tests.xml build target', 'due to sb']
test_x = ['build one to']
# Convert the text into a term-frequency matrix, keeping at most the 256 most
# frequent words, so each document vector has at most 256 dimensions
cv_ = CountVectorizer(max_features=256)
# Count the occurrences of each word. Vectorizers of this kind are fit on the
# training data first, then used to transform new data
X = cv_.fit_transform(train_x)
# Print the corpus
print('corpus', train_x)
# Print the learned vocabulary (get_feature_names() was removed in newer
# scikit-learn versions in favor of get_feature_names_out())
print('feature_names', cv_.get_feature_names_out())
# Print the word-to-index mapping
print('vocabulary_', cv_.vocabulary_)
# Print the model parameters
print('params', cv_.get_params(deep=True))
# Print the term frequencies (sparse representation)
print(X)
# View the term-frequency matrix as a dense array
print(X.toarray())
Result:
An entry such as (0, 0) 1 in the sparse printout means that the word with index 0 occurs once in the first document of the corpus.
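For the two-sentence corpus above, the output should look roughly as follows (a sketch of the expected result; CountVectorizer assigns column indices alphabetically, and the default token pattern splits 'publication-tests.xml' into 'publication', 'tests', and 'xml'):

feature_names ['build' 'due' 'fails' 'publication' 'sb' 'target' 'tests' 'to' 'xml']
X.toarray():
[[2 1 1 1 0 1 1 0 1]
 [0 1 0 0 1 0 0 1 0]]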
Wrapped as a function:
from sklearn.feature_extraction.text import CountVectorizer

def count(train_x, test_x):
    cv_ = CountVectorizer(max_features=256)
    # Fit the vectorizer on the training data train_x and obtain the
    # term-frequency representation data_train_tf as a dense matrix
    data_train_tf = cv_.fit_transform(train_x).toarray()
    # Transform the test data test_x using the fitted vocabulary
    data_test_tf = cv_.transform(test_x).toarray()
    return data_train_tf, data_test_tf
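A quick sanity check with the corpus from the example above; test-set words never seen during fitting (such as 'one') are silently dropped:

train_x = ['build fails due publication-tests.xml build target', 'due to sb']
test_x = ['build one to']
train_tf, test_tf = count(train_x, test_x)
print(train_tf.shape)  # (2, 9): two documents, nine vocabulary words
print(test_tf)         # [[1 0 0 0 0 0 0 1 0]]: only 'build' and 'to' are in the vocabulary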
- TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Same idea as above, but counts are weighted by inverse document frequency
def tfidf(train_x, test_x):
    tr_ = TfidfVectorizer(max_features=256)
    data_train_tfidf = tr_.fit_transform(train_x).toarray()
    data_test_tfidf = tr_.transform(test_x).toarray()
    return data_train_tfidf, data_test_tfidf
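Usage is identical to count(). One point worth knowing: by default TfidfVectorizer uses smoothed IDF and L2-normalizes each row, so every document vector has unit length:

import numpy as np
train_tfidf, test_tfidf = tfidf(train_x, test_x)
print(train_tfidf.shape)               # (2, 9), same vocabulary as the TF example
print(np.linalg.norm(train_tfidf[0]))  # 1.0: rows are L2-normalized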
- Word2Vec
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np

# Build skip-gram sentence vectors: average all word vectors in a sentence
# to obtain that sentence's vector
def get_embed(skip_, data_path, dim_size=300):
    data_res = []
    for s_ in LineSentence(data_path):
        vec_res = np.zeros(dim_size)
        for t_ in s_:
            try:
                vec = skip_.wv[t_]
            except KeyError:
                # The word is not in the vocabulary; fall back to a zero vector
                vec = np.zeros(dim_size)
            vec_res += vec
        data_res.append(vec_res / len(s_))
    return np.array(data_res)
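A minimal standalone sketch of get_embed before it is wired into the full pipeline below (sentences.txt is a hypothetical scratch file; min_count=1 is needed because the toy corpus is tiny):

with open('sentences.txt', 'w') as f:
    f.write('build fails due to target\n')
    f.write('due to sb\n')

model = Word2Vec(LineSentence('sentences.txt'), vector_size=300, window=10, sg=1, min_count=1)
sent_vecs = get_embed(model, 'sentences.txt', dim_size=300)
print(sent_vecs.shape)  # (2, 300): one averaged 300-dim vector per sentence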
# Produce skip-gram sentence vectors; the word-vector dimensionality is 300.
# retrain indicates whether to retrain from scratch, overwriting earlier results
def word2vec(train_x, test_x, soft, dim_size=300, retrain=True):
    data_path_train = 'F:/shiyan/bert/dataset/generate/'+soft+'/train_x.txt'
    save_path_train = 'F:/shiyan/bert/dataset/generate/'+soft+'/skip_.model'
    if retrain:
        # Write the training data train_x to a text file, one sentence per line
        train_x.to_csv(data_path_train, sep='\t', index=False, header=None)
        # Train the skip-gram model (sg=1; the `size` parameter of older
        # gensim versions is `vector_size` in gensim 4.x)
        skip_ = Word2Vec(LineSentence(data_path_train), vector_size=dim_size, window=10, sg=1)
        # Save the model
        skip_.save(save_path_train)
        # Save the word vectors in plain-text word2vec format
        skip_.wv.save_word2vec_format('F:/shiyan/bert/dataset/generate/'+soft+'/skip_keys.model', binary=False)
    else:
        # A model was trained previously; load it directly
        skip_ = Word2Vec.load(save_path_train)
    data_train_skip = get_embed(skip_, data_path_train, dim_size=dim_size)
    # Continue training with the test set added
    data_path_test = 'F:/shiyan/bert/dataset/generate/'+soft+'/test_x.txt'
    save_path_test = 'F:/shiyan/bert/dataset/generate/'+soft+'/skip_latter.model'
    test_x.to_csv(data_path_test, sep='\t', index=False, header=None)
    # Fine-tune: extend the vocabulary with the test sentences, then train on them
    test_sentences = list(LineSentence(data_path_test))
    skip_.build_vocab(test_sentences, update=True)
    skip_.train(test_sentences, total_examples=len(test_sentences), epochs=skip_.epochs)
    skip_.save(save_path_test)
    skip_.wv.save_word2vec_format('F:/shiyan/bert/dataset/generate/'+soft+'/skip_keys_latter.model', binary=False)
    data_test_skip = get_embed(skip_, data_path_test, dim_size=dim_size)
    return data_train_skip, data_test_skip
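A hypothetical end-to-end call, assuming train_x and test_x are pandas Series of whitespace-tokenized sentences and the F:/shiyan/bert/dataset/generate/<soft>/ directory already exists ('eclipse' here is a made-up project name):

import pandas as pd

train_x = pd.Series(['build fails due publication-tests.xml build target', 'due to sb'])
test_x = pd.Series(['build one to'])
data_train_skip, data_test_skip = word2vec(train_x, test_x, soft='eclipse')
print(data_train_skip.shape, data_test_skip.shape)  # (2, 300) (1, 300)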