pySVNA 2019-12-24
博客搬家:用户评论情感极性判别
本文介绍百度点石平台上一个训练赛的赛题代码。赛题是对用户评论文本进行情感极性判别的分类问题,赛题链接戳此处。
使用测试数据和训练数据生成语料库
import codecs

import numpy as np  # imported by the original script; not needed in this step


def load_data(file_path):
    """Read a tab-separated text file into a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters, so each row is a list of strings.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one list of string fields per line of the file.
    """
    with open(file_path, 'r') as lines:
        return [line.strip().split("\t") for line in lines]


def write_corpus(out_file, rows, text_column, tokenize):
    """Write one space-joined, tokenized sentence per row to *out_file*.

    Args:
        out_file: Writable text file object.
        rows: Iterable of rows as returned by ``load_data``.
        text_column: Index of the field that holds the text to segment.
        tokenize: Callable turning a string into an iterable of words.
    """
    for row in rows:
        # The trailing newline separates sentences; without it the last word
        # of one sentence is glued to the first word of the next one.
        out_file.write(" ".join(tokenize(row[text_column])) + "\n")


def main():
    """Segment the train and test texts and write them to fenci_result.csv."""
    # Imported here so the helpers above stay importable without jieba.
    import jieba

    dataAll = load_data('data_train.csv')
    dataTest = load_data('data_test.csv')
    # The context manager guarantees the corpus is flushed and closed.
    with codecs.open("fenci_result.csv", 'w', 'utf-8') as csvfile:
        write_corpus(csvfile, dataAll, 2, jieba.cut)    # text is field 2
        write_corpus(csvfile, dataTest, -1, jieba.cut)  # text is the last field


if __name__ == "__main__":
    main()
利用上面生成的语料库,使用 word2vec 工具训练并保存词向量模型,后续用它把句子转化为向量
import logging

from gensim.models import word2vec

# Input corpus and output artefacts of this training step.
CORPUS_PATH = "fenci_result.csv"
MODEL_PATH = "corpus.model"
VECTORS_PATH = "corpus.model.bin"
# Dimensionality of the learned word vectors.
VECTOR_SIZE = 400

# Emit log records at INFO level and above while training.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Load the segmented corpus.
sentences = word2vec.Text8Corpus(CORPUS_PATH)

# Train the word vectors.
# NOTE(review): the original comment called this a skip-gram model, but no
# `sg` argument is passed, so gensim's default training algorithm is used --
# confirm which one was intended.
model = word2vec.Word2Vec(sentences, size=VECTOR_SIZE)

# Save the full model for reuse, plus the vectors in binary word2vec format.
model.save(MODEL_PATH)
model.wv.save_word2vec_format(VECTORS_PATH, binary=True)
感觉训练方式很简陋,有待改善
# This program evaluates the model on a held-out part of the training data.
#coding=utf-8
import codecs
import logging
import re

import numpy as np


def load_data(file_path):
    """Read a tab-separated text file into a list of rows.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one list of string fields per line; each line is
        stripped of surrounding whitespace before it is split on tabs.
    """
    with open(file_path, 'r') as lines:
        return [line.strip().split("\t") for line in lines]


def write_result(array, outpuFilePath):
    """Write predictions as ``index,label`` lines with a 1-based index.

    Args:
        array: Sequence of integer labels.
        outpuFilePath: Path of the file to (over)write.
    """
    with open(outpuFilePath, 'w') as output_file:
        output_file.writelines(
            "%d,%d\n" % (index, label)
            for index, label in enumerate(array, start=1)
        )


def getWordVecs(wordList, vectors=None):
    """Look up the vector of every known word in *wordList*.

    Args:
        wordList: Iterable of words; each is stripped before the lookup.
        vectors: Mapping from word to vector that raises ``KeyError`` for
            unknown words. Defaults to the module-level ``model``.

    Returns:
        A float array with one row per known word; unknown words are
        skipped, so the array is empty when no word is known.
    """
    if vectors is None:
        vectors = model  # module-level word vectors, set by main()
    vecs = []
    for word in wordList:
        try:
            vecs.append(vectors[word.strip()])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')


def sentence_vector(words, vectors):
    """Return the mean word vector of *words*, or None if no word is known."""
    vecs = getWordVecs(words, vectors)
    if len(vecs) == 0:
        return None
    return vecs.mean(axis=0)


def build_dataset(rows, tokenize, vectors, text_column=2, label_column=-1):
    """Turn labelled rows into aligned feature and label arrays.

    Rows whose text contains no known word are dropped from BOTH arrays, so
    features and labels always have the same length and order.

    Args:
        rows: Rows as returned by ``load_data``.
        tokenize: Callable turning a string into an iterable of words.
        vectors: Word-to-vector mapping passed to ``getWordVecs``.
        text_column: Index of the text field.
        label_column: Index of the integer label field.

    Returns:
        A ``(features, labels)`` pair of numpy arrays.
    """
    features, labels = [], []
    for row in rows:
        vec = sentence_vector(tokenize(row[text_column]), vectors)
        if vec is None:
            continue
        features.append(vec)
        labels.append(int(row[label_column]))
    return np.array(features), np.array(labels)


def main():
    """Train an SVM on 70% of the first 1500 rows and report test metrics."""
    global model
    # Third-party dependencies are imported here so the helpers above stay
    # importable without them.
    import jieba
    import prettytable
    from gensim.models import word2vec
    from sklearn import svm
    from sklearn.decomposition import PCA
    from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                                 precision_score, recall_score)
    from sklearn.model_selection import train_test_split

    model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary=True)

    # A plain list slice avoids building a numpy array from rows that may
    # have different numbers of fields.
    dataAll = load_data('data_train.csv')[:1500]
    x_train, y_all = build_dataset(dataAll, jieba.cut, model)
    print(x_train)
    print(y_all)

    # Reduce dimensionality, keeping 95% of the variance.
    # NOTE(review): PCA is fitted on all rows before the train/test split, so
    # the held-out rows influence the projection.
    model_pca = PCA(n_components=0.95)
    newX = model_pca.fit_transform(x_train)
    components = model_pca.components_
    components_var = model_pca.explained_variance_
    components_var_ratio = model_pca.explained_variance_ratio_
    print("\n主成分分析:")
    print(components)              # principal axes
    print(len(components_var))     # number of components kept
    print(components_var_ratio)    # variance ratio of each component
    print(len(newX))
    print(len(newX[0]))

    X_train, X_test, y_train, y_test = train_test_split(
        newX, y_all, test_size=.3, random_state=0)
    clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)

    # Evaluation metrics.
    accuracy_s = accuracy_score(y_test, y_hat)
    precision_s = precision_score(y_test, y_hat, average='macro')
    recall_s = recall_score(y_test, y_hat, average='macro')
    # Weighted average, unlike the macro averages above.
    f1_s = f1_score(y_test, y_hat, average='weighted')
    print('Accuracy:')
    print(accuracy_s)
    print('Precision:')
    print(precision_s)
    print('Recall:')
    print(recall_s)
    print('f-measure:')
    print(f1_s)

    # Confusion matrix; one table row per class actually present.
    confusion_m = confusion_matrix(y_test, y_hat)
    confusion_matrix_table = prettytable.PrettyTable()
    for matrix_row in confusion_m:
        confusion_matrix_table.add_row(matrix_row)
    print('confusion matrix')
    print(confusion_matrix_table)

    write_result(y_hat, 'print.csv')


if __name__ == "__main__":
    main()
使用所有训练数据训练模型并对test数据进行预测
# This program trains on all training data and predicts the test set.
#coding=utf-8
import codecs
import logging
import re

import numpy as np


def load_data(file_path):
    """Read a tab-separated text file into a list of rows.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one list of string fields per line; each line is
        stripped of surrounding whitespace before it is split on tabs.
    """
    with open(file_path, 'r') as lines:
        return [line.strip().split('\t') for line in lines]


def write_result(array, outpuFilePath):
    """Write predictions as ``index,label`` lines with a 1-based index.

    Args:
        array: Sequence of integer labels.
        outpuFilePath: Path of the file to (over)write.
    """
    with open(outpuFilePath, 'w') as output_file:
        output_file.writelines(
            "%d,%d\n" % (index, label)
            for index, label in enumerate(array, start=1)
        )


def getWordVecs(wordList, vectors=None):
    """Look up the vector of every known word in *wordList*.

    Args:
        wordList: Iterable of words; each is stripped before the lookup.
        vectors: Mapping from word to vector that raises ``KeyError`` for
            unknown words. Defaults to the module-level ``model``.

    Returns:
        A float array with one row per known word; unknown words are
        skipped, so the array is empty when no word is known.
    """
    if vectors is None:
        vectors = model  # module-level word vectors, set by main()
    vecs = []
    for word in wordList:
        try:
            vecs.append(vectors[word.strip()])
        except KeyError:
            continue
    return np.array(vecs, dtype='float')


def sentence_vector(words, vectors):
    """Return the mean word vector of *words*, or None if no word is known."""
    vecs = getWordVecs(words, vectors)
    if len(vecs) == 0:
        return None
    return vecs.mean(axis=0)


def build_dataset(rows, tokenize, vectors, text_column=2, label_column=-1):
    """Turn labelled rows into aligned feature and label arrays.

    Rows whose text contains no known word are printed and dropped from BOTH
    arrays, so features and labels always have the same length and order.

    Args:
        rows: Rows as returned by ``load_data``.
        tokenize: Callable turning a string into an iterable of words.
        vectors: Word-to-vector mapping passed to ``getWordVecs``.
        text_column: Index of the text field.
        label_column: Index of the integer label field.

    Returns:
        A ``(features, labels)`` pair of numpy arrays.
    """
    features, labels = [], []
    for row in rows:
        vec = sentence_vector(tokenize(row[text_column]), vectors)
        if vec is None:
            print(row)
            continue
        features.append(vec)
        labels.append(int(row[label_column]))
    return np.array(features), np.array(labels)


def build_test_matrix(rows, tokenize, vectors, dim):
    """Build one feature row per test row, preserving the row order.

    The text of a row is the concatenation of all fields from index 2 on.
    A row without any known word is reported and represented by a zero
    vector, so that row i of the result always belongs to row i of *rows*
    and the ids written next to the predictions stay correct.

    Args:
        rows: Rows as returned by ``load_data``.
        tokenize: Callable turning a string into an iterable of words.
        vectors: Word-to-vector mapping passed to ``getWordVecs``.
        dim: Length of a word vector, used for the zero-vector fallback.

    Returns:
        A float array of shape ``(len(rows), dim)``.
    """
    matrix = []
    for row in rows:
        vec = sentence_vector(tokenize("".join(row[2:])), vectors)
        if vec is None:
            print('存在vecList长度为0的情况')
            print(row)
            vec = np.zeros(dim)
        matrix.append(vec)
    return np.array(matrix, dtype=float).reshape(-1, dim)


def preDataHandle(pca=None, vectors=None, tokenize=None, file_path='data_test.csv'):
    """Load the test data and project it with the PCA fitted on training data.

    The classifier is trained in the training PCA space, so the test
    features must go through that same fitted PCA (``transform``); fitting a
    second PCA on the test data would yield unrelated axes.

    Args:
        pca: Fitted PCA object. Defaults to the module-level ``model_pca``.
        vectors: Word-to-vector mapping. Defaults to the module-level
            ``model``.
        tokenize: Callable turning a string into words. Defaults to
            ``jieba.cut``.
        file_path: Path of the tab-separated test file.

    Returns:
        The projected feature matrix, one row per test row.
    """
    if pca is None:
        pca = model_pca  # module-level fitted PCA, set by main()
    if vectors is None:
        vectors = model
    if tokenize is None:
        import jieba
        tokenize = jieba.cut
    preData = load_data(file_path)
    # pca.mean_ has one entry per input feature, i.e. the word-vector size.
    x_pre = build_test_matrix(preData, tokenize, vectors, len(pca.mean_))
    return pca.transform(x_pre)


def main():
    """Train on all training rows, predict the test rows, write output.csv."""
    global model, model_pca
    # Third-party dependencies are imported here so the helpers above stay
    # importable without them.
    import jieba
    from gensim.models import word2vec
    from sklearn import svm
    from sklearn.decomposition import PCA

    model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary=True)

    dataAll = load_data('data_train.csv')
    x_train, y_train = build_dataset(dataAll, jieba.cut, model)

    # Reduce dimensionality, keeping 95% of the variance.
    model_pca = PCA(n_components=0.95)
    newX = model_pca.fit_transform(x_train)

    clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
    clf.fit(newX, y_train)

    x_pre = preDataHandle(model_pca, model, jieba.cut)
    y_pre = clf.predict(x_pre)
    write_result(y_pre, 'output.csv')
    print('Project has been finished successfully!')


if __name__ == "__main__":
    main()
比赛平台上计算出的结果f1-score为0.7249,很低,希望再接再厉