原来你是这样的薛之谦

本文对薛之谦的部分歌词进行了文本挖掘处理，欢迎围观~

目标：绘制词云图，分析词频并可视化

文本预处理

import re
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#读取原始文本文件
file=open('/home/fantasy/Desktop/xzq/薛之谦.txt')
#去掉制表符和换行符
c=[]
for line in file:
    line=line.strip('\n')
    if '\t' not in line:
        c.append(line)

#去掉非中文字符串
d=[]
for s in c:
    s=re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", s)
    d.append(s)

#去掉空字符串
dd=[]
for i in d:
    if i is not '' and ' ' and ' ' :
        dd.append(i)

#对文本进行分词处理
words=[]
for i in dd:
    w=jieba.cut(i)
    word=','.join(w)
    words.append(word)

1	words[0:10]

['\ufeff,夜深人静, ,那,是,爱情',
 '偷偷地,控制,着,我,的,心',
 '提醒,我, ,爱,你,要,随时,待命',
 ' ',
 '音乐,安静, ,还是,爱情,啊',
 '一步,一步,吞噬,着,我,的,心',
 '爱上你, ,我,失去,了,我,自己',
 ' ',
 '爱得,那么,认真, ,爱得,那么,认真',
 '可,还是,听见,了,你,说,不,可能']

1	wo=','.join(words)

type(wo)

str

词频统计

counter = {}
# 如果字典里有该词则加1，否则添加入字典
for s in words:
    for j in s.split(','):
        if j not in counter:
            counter[j] = 0
        else:
            counter[j] += 1
#词频从高到低排序
sorted_counter=sorted(counter.items(),key = lambda x:x[1],reverse = True)

1	sorted_counter

[(' ', 2424),
 ('我', 1388),
 ('的', 1380),
 ('你', 1189),
 ('了', 433),
 ('在', 358),
 ('：', 255),
 ('是', 251),
 ('都', 209),
 ('不', 187),
 ('着', 157),
 ('说', 157),
 ('就', 152),
 ('谁', 152),
 ('爱', 142),
 ('薛之谦', 129),
 ('还', 127),
 ('会', 121),
 ('要', 118),
 ('我们', 111),
 ('人', 108),
 ('好', 102),
 ('有', 100),
 ('过', 97),
 ('也', 93),
 ('这', 92),
 ('多', 88),
 ('和', 87),
 ('把', 82),
 ('又', 82),
 ('吗', 82),
 ('到', 80),
 ('自己', 79),


 ...]

1	x=sorted_counter[1:35]

[('我', 1388),
 ('的', 1380),
 ('你', 1189),
 ('了', 433),
 ('在', 358),
 ('：', 255),
 ('是', 251),
 ('都', 209),
 ('不', 187),
 ('着', 157),
 ('说', 157),
 ('就', 152),
 ('谁', 152),
 ('爱', 142),
 ('薛之谦', 129),
 ('还', 127),
 ('会', 121),
 ('要', 118),
 ('我们', 111),
 ('人', 108),
 ('好', 102),
 ('有', 100),
 ('过', 97),
 ('也', 93),
 ('这', 92),
 ('多', 88),
 ('和', 87),
 ('把', 82),
 ('又', 82),
 ('吗', 82),
 ('到', 80),
 ('自己', 79),
 ('让', 79),
 ('能', 78)]

xlabel=[]
ylabel=[]
for i in x:
    xl=i[0]
    yl=i[1]
    xlabel.append(xl)
    ylabel.append(yl)

xnames

['我', '的', '你', '了', '在', '：', '是', '都', '不', '着']

plt.style.use('ggplot')
from matplotlib.font_manager import FontProperties 
font = FontProperties(fname=r"/home/fantasy/Desktop/xzq/华文行楷.ttf", size=8) 
font1 = FontProperties(fname=r"/home/fantasy/Desktop/xzq/华文行楷.ttf", size=15)
xnames=xlabel
xx= range(len(xnames))
plt.plot(xx,ylabel,'o-',color='red')
plt.xticks(xx,xnames,rotation = 0,fontproperties=font)
plt.xlabel('排名前10的词',fontproperties=font1)
plt.ylabel('频数',fontproperties=font1)
plt.title('薛之谦歌词中出现字词排名靠前的几个',fontproperties=font1)

Text(0.5,1,'薛之谦歌词中出现字词排名靠前的几个')

png

可见，出现最多的字词为“我”，“的”，“你”

绘制词云图

fig=WordCloud(collocations=False,font_path='/home/fantasy/Desktop/xzq/华文行楷.ttf',width=1400,height=1400,min_font_size=3,margin=2).generate(wo)
plt.imshow(fig)
plt.axis("off")
plt.title('薛之谦歌词词云图',fontproperties=font1)

Text(0.5,1,'薛之谦歌词词云图')

png