Chinese Sentiment Classification Code

## Chinese sentiment classification -- COVID-19, Weibo, Chinese text

This sentiment-analysis project comes from a major assignment in a data mining and analysis course. The task: classify the sentiment of COVID-19-related Weibo posts, then analyze how sentiment changed over the period.

 1. Datasets: a training set and a to-be-predicted set. The training set consists of labeled COVID-19-related Weibo posts; the to-be-predicted set is the source of the sentiment trend.
 2. Python libraries: mainly jieba and pandas; see the import statements for the rest.
 3. Main steps: word segmentation, stopword removal, building a word-vector model, vectorizing the segmented text, model training, and prediction.

[File paths mix \\ and / and were not unified. Some of the code is not very concise; it is provided only as a reference for the steps. Related files and code (a teammate's Weibo crawler, the cleaning scripts, and the dataset links) may be uploaded later.]


Part 1: Training-set text

    -- segmentation, stopword removal, building the word-vector model (pandas was not used here, which I now regret)

1. Imports and the main routine:

import jieba
import numpy as np
import pandas as pd
import os

import gensim
from gensim.test.utils import common_texts,get_tmpfile
from gensim.models import Word2Vec

import math

import csv

if __name__=='__main__':
    
    data = pd.read_csv('D:\\documents\\data mining\\数据集\\情感分类-疫情微博\\nCoV_100k_train.labled.csv',engine="python")
    #data = pd.read_csv('D:\\documents\\data mining\\数据集\\普通情感分类-7\\情感训练集.csv')
    #print(data.head())

    #extract the target columns
    data1 = list(data.iloc[:,3]) #adjust per dataset: column 3 for the 100k set, column 0 for the generic sentiment set
    #print(data1[0])
    label = list(data.iloc[:,6]) #adjust per dataset: column 6 for the 100k set, column 1 for the generic sentiment set

    #word segmentation
    size = 100 #word-vector dimensionality
    (data2,label) = word_cut(data1,label,size) #returns the segmented texts (one token list per post) and the matching labels
    print('segmentation done')

    print(len(data2),len(label))

2. Segmentation, stopword removal, word vectors


def word_cut(data1,label,size):

    filelist = []
    for i in data1:
        i=str(i)
        i = i.replace('展开全文c','')
        s=jieba.cut(i,cut_all=False)
        cutstr = '$$$'.join(s)
        
        '''
        s1 = iter(s)
        cutstr=''
        for i in s1:
            if cutstr =='':
                cutstr+=i
            else:
                cutstr+='$$$'
                cutstr+=i
        '''
        textlist = cutstr.split('$$$')
        
        #print(textlist)
        
        filelist.append(textlist)
        

    filelist = removesw(filelist) #after stopword removal; some entries may be empty

    j=0
    for i in range(len(filelist)): #drop entries that became empty, along with their labels
        if len(filelist[i-j])== 0:
            del filelist[i-j]
            del label[i-j]
            j+=1

    #print(len(filelist),len(label))
    #print(filelist[0],label[0])
    #print(filelist[1],label[1])
    #print(filelist[-2],label[-2])
    #print(filelist[-1],label[-1])

    #open the output txt file
    txtfile = open('D:/documents/data mining/数据集/代码/data_cut.txt',mode = 'w')
    
    for i in range(len(filelist)):
       string=''
       for j in filelist[i]:
           if j != '':
               if string == '':
                   string += j
               else:
                   string += ','
                   string += j
      
       ##write to the txt file: segmented text + label
       txtfile.write(string.encode("gbk", 'ignore').decode("gbk", "ignore")+' '+str(label[i])+'\n')

    txtfile.close()
    print('segmented texts written to txt')

    model = Word2Vec(filelist,size=size,window=5,min_count=1,workers=4) #gensim 3.x API; in gensim 4.x the parameter is vector_size
    model.save("D:/documents/data mining/数据集/代码/word2vec.bin")
    print('segmented texts added to the word-vector model')
    
    return (filelist,label)



This section uses jieba to segment each post. The segmentation result is joined with '$$$', split back into a token list, and then passed through the stopword-removal function below.

The stopword-filtered token lists are fed into the Word2Vec model; the filelist argument only needs to be an iterable of token lists. The code for adding more text to the word-vector model, and for turning texts into vectors, is in Part 2.
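
For reference, the '$$$' join-then-split round trip is equivalent to simply materializing the generator that jieba.cut returns; a minimal sketch (the sample sentence is made up):

import jieba

sentence = '疫情期间出门要做好防护'
tokens = list(jieba.cut(sentence, cut_all=False)) #same token list as '$$$'.join(...).split('$$$')
print(tokens)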


def removesw(filelist):  #filelist: list of token lists

    with open('D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt','r',encoding = 'utf-8') as f:
        stop_words = f.readlines()
        stop_words = [word.replace('\n','') for word in stop_words]

    # stopword filtering
    for i in range(len(filelist)):
        filelist[i]=[x for x in filelist[i] if x not in stop_words]

    return filelist

This section removes stopwords. The txt file is a stopword list found online, which was adjusted along the way to fit the Weibo context. The list comprehension inside the for loop is the core of the logic.
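
Membership tests against a plain Python list are linear in the list length, so on a large corpus it helps to keep the stopwords in a set; a minimal variant sketch (removesw_fast and stopword_path are illustrative names, not part of the original code):

def removesw_fast(filelist, stopword_path):
    #load the stopwords once into a set for O(1) membership tests
    with open(stopword_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    #also drops empty tokens while filtering
    return [[tok for tok in text if tok and tok not in stop_words] for text in filelist]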

Part 2: Prediction-set data

  -- This part mainly uses pandas. The prediction set is segmented and stopword-filtered, and the result is fed into the word-vector model built in Part 1. The word-vector model and the segmented training and prediction texts are then used to build text vectors, which are written to .csv files.

1. Imports + data cleaning, segmentation, stopword removal

(The cleaning step aims to strip meaningless boilerplate fragments up front, so they cannot slip through after segmentation.)

import os
import pandas as pd
import jieba

import gensim
from gensim.test.utils import common_texts,get_tmpfile
from gensim.models import Word2Vec

import numpy as np
import csv



#---- data cleaning and segmentation ----
with open('D:/documents/data mining/数据集/stopwords-master/cn_stopwords.txt','r',encoding = 'utf-8') as f:
    stop_words = f.readlines()
    stop_words = [word.replace('\n','') for word in stop_words]
    stop_words.append('\u200b')

origin_dir='D:\\documents\\data mining\\数据集\\代码\\cleaned_text\\'
files=os.listdir(origin_dir)
after_clean_dir='D:\\documents\\data mining\\数据集\\代码\\after_clean\\'

def clean_mix(s):
    #print(type(s))
    return s.replace('收起全文d','').replace('展开全文d','').replace('的秒拍视频','').replace('的微博视频','').replace('的快手视频','').replace('\n','').replace('O网页链接','')

def after_jieba_stopword(s):
    a=jieba.cut(str(s),cut_all=False)
    b = '$$$'.join(a)
    c=[x for x in b.split('$$$') if x not in stop_words]
    return ' '.join(c)

N_origin=0
N_filter=0
for file in files:
    data=pd.read_table(origin_dir+file,sep=',',encoding='utf-8')
    N_origin+=len(data)
    #segmentation
    data['cleaned_text']=data['cleaned_text'].map(lambda x:clean_mix(str(x)) if isinstance(x,str) else '') #strip boilerplate fragments
    data['cleaned_text']=data['cleaned_text'].map(lambda x:after_jieba_stopword(x)) #segment and remove stopwords
    data['removeWellSign']=data['removeWellSign'].map(lambda x:clean_mix(str(x)) if isinstance(x,str) else '')
    data['removeWellSign']=data['removeWellSign'].map(lambda x:after_jieba_stopword(x))
    data_filter=data.loc[data['cleaned_text']!='',:].copy() #.copy() avoids SettingWithCopyWarning on the next line
    data_filter['id']=np.arange(0,len(data_filter),1)
    N_filter+=len(data_filter)
    data_filter[['id','original_text','cleaned_text','removeWellSign']].to_csv(after_clean_dir+file,sep=',',index=None,encoding='utf-8')
    print(file,'over')

print(N_origin)
print(N_filter)
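
For illustration, a quick check of what the two helpers above do to a single made-up post (the sample string is invented, not from the dataset):

sample = '今天的检查结果一切正常,继续加油 O网页链接 展开全文d'
print(clean_mix(sample))                        #boilerplate fragments such as 'O网页链接' stripped
print(after_jieba_stopword(clean_mix(sample)))  #space-separated tokens with stopwords removed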

2. Word-vector model training

  -- feeding the segmented prediction-set texts into the word-vector model


#continue training the word-vector model
after_clean_dir='D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
files=os.listdir(after_clean_dir)
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")

for file in files:
    data=pd.read_table(after_clean_dir+file,sep=',',encoding='utf-8')
    filelist=list(data['cleaned_text'].map(lambda x:x.split(' ')) )

    model.train(filelist,total_examples=model.corpus_count,epochs= model.iter) #gensim 3.x API; in 4.x use model.epochs
    print(file,'train over')
    
model.save("D:/documents/data mining/数据集/代码/word2vec.bin")
print('prediction texts added to the word-vector model')
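
Note that Word2Vec.train only updates vectors for words already in the model's vocabulary; words that appear only in the prediction set are silently skipped unless the vocabulary is grown first. A sketch of the incremental-update pattern, assuming the gensim 3.x API used elsewhere in this post:

model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
for file in files:
    data = pd.read_table(after_clean_dir+file, sep=',', encoding='utf-8')
    filelist = list(data['cleaned_text'].map(lambda x: x.split(' ')))
    model.build_vocab(filelist, update=True)  #add new words to the vocabulary before training on them
    model.train(filelist, total_examples=len(filelist), epochs=model.iter)
model.save("D:/documents/data mining/数据集/代码/word2vec.bin")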

3. Text vectorization

For each segmented text, look up each word's vector in the word-vector model (not every word has a vector), sum the vectors with equal weight, and divide by the number of words found, giving one text vector per post.
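
The same averaging loop appears twice below (once for the prediction set, once for the training set); a small helper sketch of the idea, using the gensim 3.x model[word] lookup from this post (average_vector is an illustrative name):

def average_vector(tokens, model, dim=100):
    text_vector = np.zeros((1, dim))
    count = 0
    for word in tokens:
        try:
            text_vector += model[word].reshape((1, dim))  #gensim 4.x: model.wv[word]
            count += 1
        except KeyError:  #word not in the vocabulary
            continue
    return text_vector / count if count else text_vector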

#vectorize the ~1.06 million prediction texts
after_clean_dir='D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
vectors_dir='D:\\documents\\data mining\\数据集\\代码\\vectors\\'
files=os.listdir(after_clean_dir)
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")

for file in files:
    data=pd.read_table(after_clean_dir+file,sep=',',encoding='utf-8')
    filelist=list(data['cleaned_text'].map(lambda x:x.split(' ')))

    df=pd.DataFrame()

    for text in filelist:
        text_vector = np.zeros(100).reshape((1,100))
        count = 0
        for word in text:
            try:
                text_vector += model[word].reshape((1,100))
                #print(word,model[word])
                count += 1
            except KeyError:
                continue
        if count !=0:
            text_vector /= count #average over the count words that were found in the model
            
        vector_list= list(list(text_vector)[0])
        
        df=df.append(pd.Series(vector_list),ignore_index=True) #DataFrame.append was removed in pandas 2.0; use pd.concat there

    df.to_csv(vectors_dir+file,sep=',',index=None,header=None)
    print(file,'vectorize over')

#--- vectorize the training-set texts ---
model = Word2Vec.load("D:/documents/data mining/数据集/代码/word2vec.bin")
txtfile = open('D:\\documents\\data mining\\数据集\\代码\\data_cut.txt','r')

data=[]
for i in txtfile.readlines():
    a=i.split(' ')
    a = [word.replace('\n','') for word in a]
    #print(a)
    data.append(a) #[[cut_word,label],[cut_word,label]]

for i in data:
    text = i[0].split(',')
    text_vector = np.zeros(100).reshape((1,100))

    count = 0

    for word in text:
        try:
            text_vector += model[word].reshape((1,100))
            count += 1
        except KeyError:
            continue
    if count !=0:
        text_vector /= count #average over the count words that were found in the model

    vector_list= list(list(text_vector)[0])
    #print(i[0],vector_list)
    i.append(vector_list) #append the text vector as a third element: [cut_word, label, vector]

print(data[0])


with open('D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv','w',newline='') as tf:
    writer = csv.writer(tf,delimiter = ',')
    #writer.writerow(file_columns)
    for row in data:
        #print(row)
        row1 = row[2]
        row1.append(int(row[1]))
        #print(row1)
        writer.writerow(row1)
    tf.close()
print('training texts vectorized')

4. Model training

--The model here is a decision tree wrapped in scikit-learn's OneVsOne classification scheme, chosen after trying several candidates. During training, the labeled vectors are split into training and test subsets (roughly 9:1 in the reported run; the code below uses test_size=0.2). Accuracy on the held-out subset is fairly high, and the model also performs well when classifying the prediction set.

from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

from joblib import dump, load

#---模型训练及预测---
after_clean_dir='D:\\documents\\data mining\\数据集\\代码\\after_clean\\'
vectors_dir='D:\\documents\\data mining\\数据集\\代码\\vectors\\'
label_dir='D:\\documents\\data mining\\数据集\\代码\\text_label\\'
files=os.listdir(after_clean_dir)

#模型训练
labeled_path = 'D:\\documents\\data mining\\数据集\\代码\\trainText_vector.csv'

labeled=pd.read_table(labeled_path,sep=',')
n=len(labeled)#11281

vectors=labeled.iloc[:,:-1]
labels=labeled.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2)

y_test_list=list(y_test)

y_train_list2=np.array(list(y_train.map(lambda x:[x])))
X_train_list=np.array(X_train)
X_test_list=np.array(X_test)

n_train=len(y_train)#10152
n_test=len(y_test)#1129

def accuracy(a,b):
	c=[]
	for i in range(len(a)):
		if a[i]==b[i]:
			c.append(1)
		else:
			c.append(0)
	return sum(c)/len(c)


model_tree_one=OneVsOneClassifier(DecisionTreeRegressor()) #one-vs-one wrapper around a decision tree
model_tree_one.fit(X_train,y_train)
predict_tree_one=model_tree_one.predict(X_test)
print(predict_tree_one)
accuracy_tree_one=accuracy(predict_tree_one,y_test_list) #0.7478753541076487
print("accuracy_tree_one:"+str(accuracy_tree_one))

dump(model_tree_one,'model_tree_one.joblib')
print('prediction model trained and saved')
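
For comparison, the hand-written accuracy function and the regressor-inside-classifier combination can be swapped for the standard scikit-learn pieces; a minimal sketch (not the setup behind the accuracy quoted above):

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = OneVsOneClassifier(DecisionTreeClassifier())
clf.fit(X_train, y_train)
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))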

5. Sentiment prediction

#prediction
#model_tree_one=load('D:\\documents\\data mining\\数据集\\代码\\model_tree_one.joblib')
model_tree_one=load('D:\\documents\\data mining\\数据集\\代码\\svc.joblib') #note: this loads a separate svc.joblib; uncomment the line above to use the decision-tree model saved in step 4
for file in files:
    vectors_file=pd.read_table(vectors_dir+file,sep=',',header=None)
    text_file=pd.read_table(after_clean_dir+file,sep=',')
    
    result=model_tree_one.predict(vectors_file)

    text_file['label']=result

    text_file.to_csv(label_dir+file,sep=',',index=None)
    print(file,'predict over')

6. Writing the per-file classification counts (positive, negative, total, etc.) to .csv

# aggregate the prediction results
from pandas import DataFrame
analysis_dir = 'D:\\documents\\data mining\\数据集\\代码\\text_label\\'
analysis_files = os.listdir(analysis_dir)
#analysis_data = {'date':[],'neg':[],'pos':[],'total':[]}
analysis_df = DataFrame(data=[],index=[],columns=['date','neg','pos','total'])

for file in analysis_files:
    analysis_file = pd.read_table(analysis_dir+file,sep=',')

    #pos = analysis_file.loc[analysis_file['label'] == '1',:].count()
    #neg = analysis_file.loc[analysis_file['label'] == '-1',:].count()
    vc=analysis_file['label'].value_counts(normalize = False, dropna = False)

    pos = vc[1]  #assumes label 1 (positive) occurs in the file
    neg = vc[-1] #assumes label -1 (negative) occurs in the file
    total = analysis_file['label'].count()

    print(file,neg,pos,total) #

    analysis_df=analysis_df.append(pd.DataFrame([[file.replace('.csv','').replace('.','-'),neg,pos,total]],columns=['date','neg','pos','total']))


analysis_df.to_csv('D:\\documents\\data mining\\数据集\\代码\\结果图.csv',sep=',',index=None)
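
Since the stated goal is to analyze how sentiment changed over time, the counts written above can be plotted directly; a minimal sketch, assuming matplotlib is available and using the column names produced by the csv above:

import matplotlib.pyplot as plt

trend = pd.read_csv('D:\\documents\\data mining\\数据集\\代码\\结果图.csv', sep=',')

plt.plot(trend['date'], trend['neg'] / trend['total'], label='negative share')
plt.plot(trend['date'], trend['pos'] / trend['total'], label='positive share')
plt.xticks(rotation=45)
plt.ylabel('share of posts')
plt.legend()
plt.show()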
    

 
