当爬取好歌词之后,用Jieba库进行中文分词



import jieba
import jieba.analyse
import matplotlib.pyplot as plt


def setStopList(path):
    """Load the stopword list from *path* (UTF-8, one word per line).

    Returns a list of stripped stopwords.
    """
    # Context manager guarantees the handle is closed; explicit encoding
    # matches the utf-8 convention used everywhere else in this script.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


def isChinese(word):
    """Return True if *word* contains at least one CJK character.

    Bug fix: the original returned inside the first loop iteration, so it
    only ever inspected the first character of the word.
    """
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in word)


def isEnglish(word):
    """Return True if *word* contains at least one alphabetic character.

    Bug fix: same first-character-only defect as isChinese.
    """
    return any(ch.isalpha() for ch in word)


def splitWords():
    """Segment the lyrics with jieba, save the cleaned words, and return
    (word, frequency) pairs sorted by descending frequency.

    Reads  'lyrics of Jay.txt'       — the lyrics to segment.
    Writes 'clean_Jay(chinese).txt'  — cleaned words separated by spaces.

    NOTE(review): relies on the module-level global ``stopwords`` being
    set (done in the __main__ guard below) — confirm before importing
    this function elsewhere.
    """
    # Whitespace-like tokens to drop in addition to the stopword list.
    blank = ['\xa0', '\u3000', ' ']
    word_freq = {}
    # Both files are context-managed so they close even on error.
    with open('clean_Jay(chinese).txt', 'w+', encoding='utf-8') as wf, \
            open('lyrics of Jay.txt', 'r', encoding='utf-8') as f:
        for line in f:
            item = line.strip('\n\r').split('\t')  # split on tab columns
            # Default (accurate) segmentation mode.
            for word in jieba.cut(item[0], cut_all=False):
                # Drop blanks, stopwords, and tokens without any Chinese
                # character (single letters, punctuation, digits...).
                if word not in blank and word not in stopwords and isChinese(word):
                    wf.write(word)
                    wf.write(' ')
                    word_freq[word] = word_freq.get(word, 0) + 1
    # (word, freq) pairs, most frequent first.
    freq_word = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    print(freq_word)
    return freq_word


if __name__ == '__main__':
    # splitWords() reads this module-level global.
    stopwords = setStopList('stopwords.txt')
    splitWords()






然后用wordcloud库做可视化分析



import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

# Mask image: the cloud is drawn inside this shape and colored from it.
figure_coloring = np.array(Image.open('jay.png'))

# Extra stopwords for this data set (production-credit names, etc.).
stopwords = set(STOPWORDS)
stopwords.add('杨大纬')
stopwords.add('杨瑞代')
stopwords.add('混音')

# NOTE: WordCloud.generate_from_frequencies() does NOT apply a stopword
# list, so filtering is done explicitly in getWordAndFreq() below.
# (The original built ``stopwords`` but never used it anywhere.)
wordcloud = WordCloud(max_font_size=40, max_words=4000,
                      mask=figure_coloring, background_color='white',
                      font_path='msyh.ttc', margin=2)


def getWordAndFreq():
    """Read the pre-segmented lyrics and count word frequencies.

    Reads 'clean_Jay(chinese).txt' (space-separated words, one or more
    lines).

    Returns
    -------
    (dict, list)
        ``freq_words``: word -> count mapping;
        ``word_freq``: (word, count) pairs sorted by descending count.
    """
    words = []
    with open('clean_Jay(chinese).txt', 'r', encoding='utf-8') as f:
        for line in f:
            words.extend(line.strip().split(' '))
    freq_words = {}
    for word in words:
        # Skip empty tokens (''.split(' ') yields ['']) and stopwords —
        # the original counted both.
        if not word or word in stopwords:
            continue
        freq_words[word] = freq_words.get(word, 0) + 1
    # (word, count) pairs, most frequent first.
    word_freq = sorted(freq_words.items(), key=lambda x: x[1], reverse=True)
    print(word_freq)
    return freq_words, word_freq


def writeFreqWords(word_freq):
    """Persist the sorted (word, count) list to '词频统计Jay.txt'."""
    with open("词频统计Jay.txt", 'w+', encoding='utf-8') as f:
        f.write(str(word_freq))
        f.write('\n')


if __name__ == '__main__':
    freq_words, word_freq = getWordAndFreq()
    writeFreqWords(word_freq)
    # generate() would auto-segment the text, which handles Chinese
    # poorly — the text was already segmented upstream with jieba, so we
    # feed the frequency dict directly.
    wordcloud.generate_from_frequencies(freq_words)
    image_colors = ImageColorGenerator(figure_coloring)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.figure()
    # Recolor the cloud from the mask image's own colors.
    plt.imshow(wordcloud.recolor(color_func=image_colors),
               interpolation="bilinear")
    plt.axis('off')
    plt.show()
    wordcloud.to_file('jayresult.png')