1.文本数据的向量化

1.1名词解释

CF:文档集的频率,是指词在文档集中出现的次数

DF:文档频率,是指出现词的文档数

IDF:逆文档频率,idf = log(N/(1+df)),其中 N 为文档总数;分母取 1+df 是为了避免 df=0 时出现除零。

TF:词在文档中的频率

TF-IDF:TF-IDF= TF*IDF

1.2文本数据样本集

为了讲解文本数据的向量化,假设我们有4个文本,所有文本一共有6个不同的词,如下所示。

       
doc1: iphone guuci huawei watch huawei
doc2: huawei watch iphone watch iphone guuci
doc3: skirt skirt skirt flower
doc4: watch watch huawei
1.3计算汇总



        iphone  watch   guuci   huawei  skirt  flower
doc1 TF  1/5    1/5     1/5     2/5     0      0
doc2 TF  2/6    2/6     1/6     1/6     0      0
doc3 TF  0      0       0       0       3/4    1/4
doc4 TF  0      2/3     0       1/3     0      0

DF(含词的文档数):

        iphone  watch   guuci   huawei  skirt  flower
DF       2      3       2       3       1      1

IDF(逆文档频率)= log(N/(1+DF)),N=4:

iphone: log(4/(1+2)) = log(4/3)
watch:  log(4/(1+3)) = log(4/4)
guuci:  log(4/(1+2)) = log(4/3)
huawei: log(4/(1+3)) = log(4/4)
skirt:  log(4/(1+1)) = log(4/2)
flower: log(4/(1+1)) = log(4/2)

doc1 TFIDF: 1/5*log(4/3), 1/5*log(4/4), 1/5*log(4/3), 2/5*log(4/4), 0, 0
doc2 TFIDF: 2/6*log(4/3), 2/6*log(4/4), 1/6*log(4/3), 1/6*log(4/4), 0, 0
doc3 TFIDF: 0, 0, 0, 0, 3/4*log(4/2), 1/4*log(4/2)
doc4 TFIDF: 0, 2/3*log(4/4), 0, 1/3*log(4/4), 0, 0
       
       
1.4实现tf-idf

人肉完成,相对来说,tf-idf的实现还比较简单。
# -*- coding: utf-8 -*-
"""Vectorize text documents with TF-IDF.

Author: 蔚蓝的天空tom -- "Talk is cheap, show me the code"

Two implementations are provided:
  * sklearn_tfidf() -- reference implementation using scikit-learn.
  * tfidf_alg()     -- hand-rolled implementation using only numpy,
                       following idf = log10(N / (1 + df)).
"""
import numpy as np


def sklearn_tfidf():
    """Compute and print the TF-IDF matrix of the toy corpus via scikit-learn.

    NOTE(review): sklearn's TfidfTransformer smooths and L2-normalizes by
    default, so its numbers will not match tfidf_alg() exactly.
    """
    # Imported lazily so the module (and tfidf_alg) still runs when
    # scikit-learn is not installed; __main__ only calls tfidf_alg().
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    tag_list = ['iphone guuci huawei watch huawei',
                'huawei watch iphone watch iphone guuci',
                'skirt skirt skirt flower',
                'watch watch huawei']
    vectorizer = CountVectorizer()          # documents -> term-count matrix
    X = vectorizer.fit_transform(tag_list)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)    # counts -> TF-IDF values
    print(tfidf.toarray())


def tfidf_alg():
    """Hand-rolled TF-IDF over a fixed toy corpus; prints and returns the matrix.

    Returns:
        numpy.ndarray of shape (n_docs, n_words) where
        tfidf[i][j] = tf(word_j, doc_i) * log10(N / (1 + df(word_j))).
    """
    docs = np.array(['iphone guuci huawei watch huawei',
                     'huawei watch iphone watch iphone guuci',
                     'skirt skirt skirt flower',
                     'watch watch huawei'])
    words = np.array(['iphone', 'guuci', 'huawei', 'watch', 'skirt', 'flower'])

    # Tokenize each document once.  The original used str.count()/str.find(),
    # which match substrings (e.g. 'watch' inside 'smartwatch'); counting
    # whole tokens avoids that bug.
    token_lists = [doc.split() for doc in docs]

    # CF: raw count of each word in each document.
    cfs = np.array([[tokens.count(word) for word in words]
                    for tokens in token_lists])

    # TF: word frequency within each document.
    tfs = []
    for cf in cfs:
        total = cf.sum()
        # Guard against an (unexpected) empty document -> avoid 0/0.
        tfs.append(cf / total if total else cf.astype(float))
    tfs = np.array(tfs)

    # One-hot presence matrix (words x docs) and DF: number of documents
    # containing each word.
    oneHots = [[1 if word in tokens else 0 for tokens in token_lists]
               for word in words]
    dfs = [sum(row) for row in oneHots]

    # IDF: log10(N / (1 + df)); the +1 keeps df == 0 from dividing by zero.
    N = docs.shape[0]
    idfs = [np.log10(N / (1.0 + df)) for df in dfs]

    # TF-IDF = TF * IDF, broadcasting the idf row across every document.
    tfidfs = tfs * np.array(idfs)

    print('==================result============================')
    print('\ndocs:\n', np.array(docs))
    print('\nwords:\n', np.array(words))
    print('\noneHots:\n', np.array(oneHots))
    print('\nCF:\n', np.array(cfs))
    print('\nTF:\n', np.array(tfs))
    print('\nDF:\n', np.array(dfs))
    print('\nIDF:\n', np.array(idfs))
    print('\nTF-IDF:\n', np.array(tfidfs))
    print('==============================================')
    return tfidfs


if __name__ == '__main__':
    tfidf_alg()
    #sklearn_tfidf()
1.5运行结果
==================result============================ docs: ['iphone guuci
huawei watch huawei' 'huawei watch iphone watch iphone guuci' 'skirt skirt
skirt flower' 'watch watch huawei'] words: ['iphone' 'guuci' 'huawei' 'watch'
'skirt' 'flower'] oneHots: [[1 1 0 0] [1 1 0 0] [1 1 0 1] [1 1 0 1] [0 0 1 0]
[0 0 1 0]] CF: [[1 1 2 1 0 0] [2 1 1 2 0 0] [0 0 0 0 3 1] [0 0 1 2 0 0]] TF: [[
0.2 0.2 0.4 0.2 0. 0. ] [ 0.33333333 0.16666667 0.16666667 0.33333333 0. 0. ] [
0. 0. 0. 0. 0.75 0.25 ] [ 0. 0. 0.33333333 0.66666667 0. 0. ]] DF: [2 2 3 3 1
1] IDF: [ 0.12493874 0.12493874 0. 0. 0.30103 0.30103 ] TF-IDF: [[ 0.02498775
0.02498775 0. 0. 0. 0. ] [ 0.04164625 0.02082312 0. 0. 0. 0. ] [ 0. 0. 0. 0.
0.2257725 0.0752575 ] [ 0. 0. 0. 0. 0. 0. ]]
==============================================
(end)