jieba word segmentation and wordcloud word clouds

1. Download a plain-text copy of the novel 天龙八部 (天龙八部.txt) and a general-purpose jieba stop word list (停用词表.txt).

 

2. Download a background image and save it as 图片.jpg.

 

3. Check that a font file is available, e.g. C:/Windows/Fonts/simsun.ttc; a quick existence check for all of these inputs is sketched below.
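
Before running the script, it helps to confirm that all of these inputs are actually in place. A minimal sketch, assuming the file names used in the code below:

import os

for path in ["./天龙八部.txt", "./停用词表.txt", "./图片.jpg",
             "C:/Windows/Fonts/simsun.ttc"]:
    print(path, "->", "ok" if os.path.exists(path) else "missing")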

 

# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
from PIL import Image
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
# Chinese word segmentation: run 天龙八部.txt through jieba, drop the stop
# words, and write the space-joined tokens to 天龙八部分词.txt
with open("./停用词表.txt", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}

def seg_sentence(sentence):
    words = [word for word in jieba.cut(sentence.strip())
             if word not in stopwords and word != '\t']
    return ' '.join(words)

with open("天龙八部分词.txt", "w", encoding="utf-8") as outputs:
    for line in open("./天龙八部.txt", "r", encoding="GB18030"):
        outputs.write(seg_sentence(line) + '\n')
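
A quick sanity check of seg_sentence on a made-up sample sentence; the exact tokens depend on jieba's dictionary and on the stop word list:

print(seg_sentence("乔峰大声道:我是契丹人!"))
# prints the space-joined tokens that survive the stop word filter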


# Keyword extraction with the TF-IDF algorithm; returns keywords with their
# TF-IDF weights. allowPOS=('nr',) keeps only words tagged as person names.
with open("./天龙八部分词.txt", encoding="utf-8") as f:
    text = f.read()
result = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=('nr',))
print(result)
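
As an aside, jieba.analyse can apply the stop word list by itself via jieba.analyse.set_stop_words, so the keywords could also be extracted straight from the raw text without the manual filtering pass above. A sketch (result2 is just an illustrative name):

jieba.analyse.set_stop_words("./停用词表.txt")
with open("./天龙八部.txt", encoding="GB18030") as f:
    raw_text = f.read()
result2 = jieba.analyse.extract_tags(raw_text, topK=20,
                                     withWeight=True, allowPOS=('nr',))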

# Convert the result, e.g. [('段誉', 0.5881865046044787), ('萧峰', 0.4631424402591722), ...],
# into a dict, which is the input format the word cloud module expects
keywords = dict(result)

# Build the word cloud mask from the background image
image = Image.open('./图片.jpg')
graph = np.array(image)
wc = WordCloud(font_path='C:/Windows/Fonts/simsun.ttc',
               background_color="white",
               max_words=15,
               mask=graph)
# Generate the cloud from the keyword weights, then recolor the words
# using the colors of the background image
wc.generate_from_frequencies(keywords)
image_color = ImageColorGenerator(graph)
wc.recolor(color_func=image_color)
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('词云.jpg')
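
Note that wordcloud draws words only where the mask is not pure white, so the non-white shape in 图片.jpg determines the outline of the cloud, while ImageColorGenerator samples the image's own colors so the recolored words match the underlying picture.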

 
