统计分词





#!/usr/bin/python
#coding=utf-8

import urllib2
import sys, time, re
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.analyse
import jieba.posseg as pseg
import os
jieba.initialize()
import operator


'''
1:清理字符串，提取特殊字符串
    a:去掉空白字符
    b:提取书名 <>
    c:日期 提取

关键词也不一定就是文章所要表达的意思，只能部分反映出文章的中心词。
'''

t1 = time.time()
url = "10.txt"
content = open(url, "rb").read()
#print type(content)
print '文章长度：'.decode('utf-8'), len(content)
strRe = re.sub('s', '', content)   #用正则干掉所有的空白
print '用正则干掉所有的空白后，字符长度'.decode('utf-8'), len(strRe)

'''
fo = open("foo.txt", "wb")
fo.write(strRe);
# 关闭打开的文件
fo.close()
'''

#分词， 未登录词用veterbi分词
words = list(jieba.cut(strRe, cut_all=False))
print "分词的总数：", len(words)
wordset = sorted(set(words))
print "不重复的单词数：".decode('utf-8'), len(wordset)

#TF-IDF
#jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
#tags = jieba.analyse.extract_tags(strRe, topK=10)
#print "TF-IDF 未去除停用词".decode('utf-8')
#print(",".join(tags))

jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
jieba.analyse.set_stop_words("../extra_dict/cn_stop_words.txt")
tags = jieba.analyse.extract_tags(strRe, topK=10)
print "TF-IDF 去除停用词".decode('utf-8')
print(",".join(tags))

#TextRank
#tagswords = jieba.analyse.textrank(content)
#print(",".join(tagswords))

print "TextRank"
seg_list = jieba.analyse.textrank(strRe)
print(",".join(seg_list))

'''
list = words
fl = open('list.txt', 'wb')

for i in range(len(list)):
    fl.write(list[i].encode('utf-8')+'--')
    
fl.close()
'''

'''

print "获取10个关键词".decode('utf-8')
seg_list = jieba.analyse.textrank(strRe)
for i in range(len(seg_list)):
    print seg_list[i].encode('gb2312')
    
# 统计分词结果后，每个个分词的次数
wordsDict = {}
DictsMaxWordlen = 0
singal = ''
for w in words:
    if wordsDict.get(w) == None:
        wordsDict[w] = 1
    else:
        wordsDict[w] += 1
        
    if DictsMaxWordlen <= wordsDict[w]:
        DictsMaxWordlen = wordsDict[w]
        global singal 
        singal = w
        #print w

print "分词最多重复的次数：".decode('utf-8'), DictsMaxWordlen , "分词是：".decode('utf-8'),singal

#按字典值排序（默认为升序），返回值是字典{key, tuple}
sorted_wordsDict = sorted(wordsDict.iteritems(), key=operator.itemgetter(1))
#print type(sorted_wordsDict[1])    #tuple


classNumWord = {}

for w in sorted_wordsDict:
    if classNumWord.has_key(w[1]) == True:
        if w[0] not in classNumWord[w[1]]:  
            classNumWord[w[1]].append(w[0])
    else:
        classNumWord[w[1]] = []
        classNumWord[w[1]].append(w[0])
#将字典排序，按照升序, 通过键排序，
sort_classNumWord = sorted(classNumWord.iteritems(), key=lambda asd:asd[0], reverse = False)
#print sort_classNumWord[20][1][0].encode('gb2312') 

wordslength = 0     #分词的总数
worldsNum = 0       #分词有多少个不同的词或词组
wordsFequencelist = {}  #分词出现的频次等级，从1到N次,并存储所对应等级的词语个数
for w in sort_classNumWord:
    worldsNum += w[0]
    wordslength += len(w[1]) * w[0]
    
    wordsFequencelist[w[0]] = []
    wordsFequencelist[w[0]].append(len(w[1]))
        
    #print "============================" 
    #for i in range(len(w[1])):     #按照出现的频次，打印词组
     #   print w[1][i]
    #print "出现".decode('utf-8'),w[0], "次的有：".decode('utf-8') ,len(w[1])
    #print "============================"      

print wordsFequencelist
print "一共有".decode('utf-8'), worldsNum,'个不同的词或词组'.decode('utf-8')
print "一共有".decode('utf-8'), wordslength, '个词或词组'.decode('utf-8')

'''


t2 = time.time()
tm_cost = t2-t1
print '程序运行时间'.decode('utf-8'),tm_cost