paper about spring

1. Parse the JSON file of raw user info

#!/usr/bin/python
# -*- coding=utf-8 -*-

import os
import sys

import json

def main():

    root_dir = sys.argv[1]
    
    province_file = root_dir +"/conf/province.list"
    fin = open(province_file, 'r')
    provinces = set()
    for line in fin:
        province = line.strip()
        provinces.add(province)
    fin.close()

    input_file  = root_dir +"/source_data/userinfo.json"
    output_file = root_dir +"/result_data/userinfo.data"

    fin = open(input_file, 'r')
    fout = open(output_file, 'w')
    for line in fin:
        if line.strip() == "[]":
            continue
        json_file = json.loads(line.strip())
        userid   = json_file['userId']
        sex      = json_file['sex']
        location = json_file['location']
        birthday = json_file['birthday']
        attentioncount = json_file['attentionCount']
        fanscount      = json_file['fansCount']
        weibocount     = json_file['weiboCount']
        label_list=json_file['labelList']
        user_introduce=json_file['userIntroduce']
        if not sex:
            sex = 'null'
        if location.find(' ') != -1:
            fields = location.split(' ')
            location = fields[0]
        elif location:
            for province in provinces:
                if location.find(province) != -1:
                    location = province
        if not location :
            location = 'null'
        # the delimiter here was lost in the source text; assuming a
        # "YYYY-MM-DD" style birthday, keep only the year part
        index = birthday.find('-')
        if index != -1:
            birthday = birthday[0:index]
        else:
            birthday = 'null'
        if not attentioncount:
            attentioncount = '0'
        if not fanscount:
            fanscount = '0'
        if not weibocount:
            weibocount = '0'
        if not label_list or not label_list.strip():
            label_list='null'
        if not user_introduce or not user_introduce.strip():
            user_introduce='null'
        
        print>>fout, "%s	%s	%s	%s	%s	%s	%s	%s	%s"%(userid, sex, location, birthday, attentioncount, fanscount, weibocount,label_list,user_introduce)
    fin.close()
    fout.close()

if __name__ == "__main__":

    main()
UserInfoParser

1. The user labels still need to go through word segmentation.

2. Judging from this data, roughly one third of all users have labels.

3. Users without labels are marked with null here.

If you need to drop the records that have no labels, the relevant shell commands are:

cat userinfo.data | awk -F '\t' '{print $8}' | sed '/null/d'
cat userinfo.data | cut -f 8 | sed '/null/d'
cat userinfo.data | awk -F '\t' '$8 != "null" {print $8}'
UserShell

2. Mapping and sorting operations

#!/usr/bin/python

import os
import sys

def main():

    root_dir = sys.argv[1]
    topN     = int(sys.argv[2])

    topic_total_file = root_dir +'/result_data/topic_id.data.total'
    id_topic = {}
    fin = open(topic_total_file, 'r')
    for line in fin:
        fields = line.strip().split('	')
        id_topic[fields[1]] = fields[0]
    fin.close()

    topicid_count = {}
    sources = ['sina', 'tencent']
    for source in sources:
        input_file = root_dir +'/result_data/'+ source +'.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == '-1':
                continue
            topics = fields[2].split(':')
            for topic in topics:
                if topic in topicid_count:
                    topicid_count[topic] += 1
                else:
                    topicid_count[topic] = 1
        fin.close()
    sort_topic = sorted(topicid_count.items(), key = lambda d:d[1], reverse=True)
    if len(sort_topic) < topN:
        topN = len(sort_topic)
    output_file = root_dir +'/result_data/topic_id.data'
    fout = open(output_file, 'w')
    for i in range(topN):
        print>>fout, "%s	%s	%s"%(sort_topic[i][0], id_topic[sort_topic[i][0]], topicid_count[sort_topic[i][0]])
    fout.close()

if __name__ == "__main__":
    
    main()
TopN_topic

1) Build two mappings: one stores the correspondence between id and topic, the other stores the correspondence between id and the number of times that topic appears.

2) Sorting by count is done with sorted(dict.items(), key=lambda d: d[1], reverse=True), which yields a list of (key, value) tuples; a minimal sketch is shown below.
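
A minimal sketch of this counting-and-sorting pattern, using made-up topic ids and counts (not data from the project):

#!/usr/bin/python
# count occurrences and sort by count, as in TopN_topic (illustrative data only)
counts = {}
for topic in ['12', '7', '12', '3', '12', '7']:
    counts[topic] = counts.get(topic, 0) + 1

# sorted() over dict.items() returns a list of (key, count) tuples,
# ordered here by count in descending order
top = sorted(counts.items(), key=lambda d: d[1], reverse=True)
print(top)        # [('12', 3), ('7', 2), ('3', 1)]
print(top[0][0])  # '12' -- the most frequent topic id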

3. Document id allocation and stopword removal

#!/usr/bin/python

import os
import sys

def main():

    if len(sys.argv) != 4:
        print "error parameters!"
        sys.exit(0)

    root_dir = sys.argv[1][0:sys.argv[1].rfind('/')]
    input_dir = sys.argv[1]
    output_root_dir = sys.argv[2]
    topic_multiple = float(sys.argv[3])
    
    # stopwords
    stopwords_file = root_dir +'/conf/stopwords.list'
    fin = open(stopwords_file, 'r')
    stopwords = set()
    for line in fin:
        word = line.strip()
        stopwords.add(word)
    fin.close()

    # generate ntopics_alpha.data
    cmd = "wc -l "+ root_dir +"/result_data/topic_id.data | awk -F' ' '{print $1}'"
    num_topics = int(int(os.popen(cmd).read().strip()) * topic_multiple)
    alpha = 50 / float(num_topics)
    ntopics_alpha_file = output_root_dir +'/ntopics_alpha.data'
    fout = open(ntopics_alpha_file, 'w')
    print>>fout, "%s	%s"%(num_topics, alpha)
    fout.close()

    # allocate docid and remove stopwords
    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        input_file = input_dir +'/'+ source +'.data'
        cmd = "wc -l "+ input_file +" | awk -F' ' '{print $1}'"
        line_number = os.popen(cmd).read().strip()
        output_file = output_root_dir +'/'+ source +'/source.data'
        fin = open(input_file, 'r')
        fout = open(output_file, 'w')
        print>>fout, line_number
        docid = {}
        allocate_id = 0
        for line in fin:
            fields = line.strip().split('	')
            doc    = fields[0]
            docid[doc] = allocate_id
            allocate_id += 1
            line = ""
            for word in fields[1].split(' '):
                if word.strip() and word not in stopwords:
                    line += word +'	'
            if len(line) == 0:
                print>>fout, 'null'
            else:
                print>>fout, line
        fin.close()
        fout.close()
        docid_file = output_root_dir +'/'+ source +'/docid.map'
        fout = open(docid_file, 'w')
        for doc in docid:
            print>>fout, "%s	%s"%(doc, docid[doc])
        fout.close()

if __name__ == "__main__":

    main()
allocateDocId

1) How to remove stopwords.

2) How to assign each document an id and save the mapping.

3) In practice, whether to use the current document id and the word itself as the key, or to process them further, can be decided case by case; a small sketch of the pattern follows below.
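
A small sketch of the stopword filtering and doc-id allocation pattern, with a made-up stopword list and made-up documents:

#!/usr/bin/python
# filter stopwords out of a tokenized document and give each document
# an incrementing integer id (illustrative data only)
stopwords = set(['the', 'of', 'and'])
docs = {'doc_a': 'rise of the machines', 'doc_b': 'the art of war'}

docid = {}
allocate_id = 0
for doc in docs:
    docid[doc] = allocate_id      # remember the doc -> id mapping
    allocate_id += 1
    kept = [w for w in docs[doc].split(' ') if w.strip() and w not in stopwords]
    print("%s\t%s" % (docid[doc], ' '.join(kept) if kept else 'null'))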

4. generate_nw_nd

#!/usr/bin/python

import os
import sys

def main():

    root_dir = sys.argv[1]

    cmd = "cat "+ root_dir +"/lda_model/ntopics_alpha.data | awk -F' ' '{print $1}' "
    num_topics = int(os.popen(cmd).read().strip())

    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        tassign_file = root_dir +'/lda_model/'+ source +'/model-final.tassign'
        nd_file = root_dir +'/lda_model/'+ source +'/nd.data'
        cmd = "head -1 "+ root_dir +"/lda_model/"+ source +"/wordmap.txt"
        num_tokens = int(os.popen(cmd).read().strip())
        nw = [0 for i in range(num_topics * num_tokens)]
        fin = open(tassign_file, 'r')
        fout = open(nd_file, 'w')
        docid = 0
        for line in fin:
            fields = line.strip().split(' ')
            nd = [0 for i in range(num_topics)]
            for pair in fields:
                parts   = pair.split(':')
                wordid  = int(parts[0])
                topicid = int(parts[1])
                nw[wordid*num_topics + topicid] += 1
                nd[topicid] += 1
            print>>fout, "%s	%s"%(docid, "	".join([str(i) for i in nd]))
            docid += 1
        fin.close()
        fout.close()
        nw_file = root_dir +'/lda_model/'+ source +'/nw.data'
        fout = open(nw_file, 'w')
        for wordid in range(num_tokens):
            line = ''
            for topicid in range(num_topics):
                line += str(nw[wordid*num_topics + topicid]) +'	'
            print>>fout, line
        fout.close()

if __name__ == "__main__":
    main()
generate_nw_nd

1) A flat Python list is used as a 2D matrix: entry (wordid, topicid) is stored at nw[wordid*num_topics + topicid]; a small illustration follows below.
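
A small illustration of that flat-list indexing, with made-up sizes (nw here plays the same role as the word-topic count matrix in generate_nw_nd):

#!/usr/bin/python
# store a (num_tokens x num_topics) count matrix in one flat list,
# addressed in row-major order (illustrative sizes only)
num_tokens = 3
num_topics = 4
nw = [0 for i in range(num_tokens * num_topics)]

# cell (wordid, topicid) lives at offset wordid*num_topics + topicid
nw[2 * num_topics + 1] += 1
nw[2 * num_topics + 1] += 1
nw[0 * num_topics + 3] += 1

# print the matrix row by row
for wordid in range(num_tokens):
    print(nw[wordid * num_topics : (wordid + 1) * num_topics])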

5. topic_mapping

#!/usr/bin/python

import os
import sys

def similarity(real_vector, lda_vector):

    score = float(0)

    words = set()
    for word in real_vector:
        if word not in words:
            words.add(word)
    for word in lda_vector:
        if word not in words:
            words.add(word)
    
    real_list = []
    lda_list = []
    for word in words:
        if word in real_vector:
            real_list.append(real_vector[word])
        else:
            real_list.append(float(0))
        if word in lda_vector:
            lda_list.append(lda_vector[word])
        else:
            lda_list.append(float(0))
    for i in range(len(real_list)):
        score += real_list[i] * lda_list[i]

    return score

def topic_mapping(realtopic_vector, ldatopic_vector):

    real_lda = {}
    
    for realtopic in realtopic_vector:
        max_topic = '0'
        max_score = float(0)
        for ldatopic in ldatopic_vector:
            score = similarity(realtopic_vector[realtopic], ldatopic_vector[ldatopic])
            if score > max_score:
                max_topic = ldatopic
                max_score = score
        real_lda[realtopic] = max_topic

    return real_lda

def main():

    root_dir = sys.argv[1]
    twords   = int(sys.argv[2])
    realtopic_words = int(sys.argv[3])

    source_list = ['sina', 'tencent', 'tianya']
    
    # generate vsm of real topic
    topicid_file = root_dir +"/result_data/topic_id.data"
    realtopic_vsm = {}
    fin = open(topicid_file, 'r')
    for line in fin:
        fields = line.strip().split('	')
        realtopic_vsm[fields[0]] = {}
    fin.close()
    topic_source_list = ['sina', 'tencent']
    for topic_source in topic_source_list:
        input_file = root_dir +'/result_data/'+ topic_source +'.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            topicid = fields[2]
            if topicid == '-1':
                continue
            for topic in topicid.split(':'):
                if topic not in realtopic_vsm:
                    continue
                for word in fields[1].split(' '):
                    if word not in realtopic_vsm[topic]:
                        realtopic_vsm[topic][word] = 1
                    else:
                        realtopic_vsm[topic][word] += 1
        fin.close()
    # generate vector of real topic
    realtopic_vector = {}
    for topic in realtopic_vsm:
        realtopic_vector[topic] = {}
        length = realtopic_words
        sorted_tmp = sorted(realtopic_vsm[topic].items(), key = lambda d:d[1], reverse=True)
        if len(sorted_tmp) < length:
            length = len(sorted_tmp)
        sum_count = 0
        for i in range(length):
            sum_count += sorted_tmp[i][1]
        for i in range(length):
            realtopic_vector[topic][sorted_tmp[i][0]] = sorted_tmp[i][1] / float(sum_count)

    # mapping real topic with lda topic
    for source in source_list:
        input_file = root_dir +"/lda_model/"+ source +"/model-final.twords"
        # re-build the topic vector
        ldatopic_vector = {}
        fin = open(input_file, 'r')
        cur_topic = "0"
        for line in fin:
            line = line.strip()
            if line.find('Topic') != -1:
                fields = line.split(' ')
                cur_topic = fields[1][0: fields[1].find('th')]
                ldatopic_vector[cur_topic] = {}
            else:
                fields = line.split('	')
                word = fields[0]
                weight = float(fields[1])
                if weight > 0.0:
                    ldatopic_vector[cur_topic][word] = weight
        fin.close()
        real_lda = topic_mapping(realtopic_vector, ldatopic_vector) 
        output_file = root_dir +"/lda_model/"+ source +"/topic_mapping.data"
        fout = open(output_file, 'w')
        for realtopic in real_lda:
            print>>fout, "%s	%s"%(realtopic, real_lda[realtopic])
        fout.close()
                

if __name__ == "__main__":

    main()
topic_mapping

1) Map each real_topic to an lda_topic.

(the real_topic's words come from counting; the lda_topic's words come from LDA training)

2) Calculate the similarity of two dictionaries treated as vectors; a small sketch follows below.
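
A minimal sketch of that similarity calculation, with made-up word weights: similarity() above computes the same inner product over the union of words, where a word missing from either dict contributes zero.

#!/usr/bin/python
# dot product of two sparse vectors stored as {word: weight} dicts
# (weights below are made up for illustration)
def dot(real_vector, lda_vector):
    score = 0.0
    # only words present in both dicts contribute to the product
    for word in real_vector:
        if word in lda_vector:
            score += real_vector[word] * lda_vector[word]
    return score

real = {'flood': 0.5, 'rain': 0.3, 'river': 0.2}
lda  = {'rain': 0.4, 'river': 0.1, 'sun': 0.05}
print(dot(real, lda))   # 0.3*0.4 + 0.2*0.1 = 0.14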

6. final_data

#!/usr/bin/python

import sys

def main():
    root_dir = sys.argv[1]

    topn = 2 # the top n LDA topics are taken as the document's real topic distribution
    source_list = ['sina', 'tencent', 'tianya']
    for source in source_list:
        allocateid_ldatopic = {} # value is a list
        input_file = root_dir +'/lda_model/'+ source +'/nd.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            allocateid =  fields[0]
            topic_distribution = {}
            # fields[1:] are the per-topic counts; range(1, len(fields)) keeps the last column too
            for i in range(1, len(fields)):
                topic_distribution[i-1] = int(fields[i])
            sorted_tmp = sorted(topic_distribution.items(), key = lambda d:d[1], reverse=True)
            allocateid_ldatopic[allocateid] = []
            for i in range(topn):
                allocateid_ldatopic[allocateid].append(sorted_tmp[i][0])
        fin.close()
        ldatopic_realtopic = {} # value is a list
        input_file = root_dir +'/lda_model/'+ source +'/topic_mapping.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            ldatopic = fields[1]
            realtopic = fields[0]
            if ldatopic not in ldatopic_realtopic:
                ldatopic_realtopic[ldatopic] = [realtopic]
            else:
                ldatopic_realtopic[ldatopic].append(realtopic)
        fin.close()
        userid_profile = {}
        input_file = root_dir +'/result_data/userinfo.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            userid = fields[0]
            sex = fields[1]
            location = fields[2]
            age = fields[3]
            fanscount = fields[5]
            weibocount = fields[6]
            userid_profile[userid] = [sex, location, age, fanscount, weibocount]
        fin.close()
        docid_allocateid = {}
        input_file = root_dir +'/lda_model/'+ source +'/docid.map'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            docid_allocateid[fields[0]] = fields[1]
        fin.close()
        # final.data
        input_file = root_dir +'/result_data/'+ source +'.data'
        output_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        fout = open(output_file, 'w')
        for line in fin:
            fields = line.strip().split('	')
            docid = fields[0] 
            allocateid = docid_allocateid[docid]
            topic_set = set()
            if fields[2] != '-1':
                for topic in fields[2].split(':'):
                    if topic in topic_set:
                        continue
                    topic_set.add(topic)
            for ldatopic in allocateid_ldatopic[allocateid]:
                if str(ldatopic) not in ldatopic_realtopic:
                    continue
                for topic in ldatopic_realtopic[str(ldatopic)]:
                    if topic not in topic_set:
                        topic_set.add(topic)
            if topic_set:
                topics = ':'.join(topic_set)
            else:
                topics = 'null'
            comment = fields[3]
            retweet = fields[4]
            praise = fields[5]
            userid = fields[6]
            if userid in userid_profile:
                user_profile = '	'.join(userid_profile[userid])
            else:
                user_profile = 'null	null	null	null	null'
            print>>fout, "%s	%s	%s	%s	%s	%s	%s	%s"%(docid, allocateid, topics, comment, retweet, praise, userid, user_profile)
        fin.close()
        fout.close()

if __name__ == "__main__":

    main()
final_data

1) Assign each document its top-2 LDA topics.

2) Invert a dict's keys and values: although each key is unique, the same value can come from different keys, so the inverted mapping holds a list per value (see the sketch after this list).

3) Give each document a set of related topics, including its own topics as well as the topics obtained through the top-2 LDA-topic mapping.

4) Merge allocateid_ldatopic, ldatopic_realtopic, userid_profile, and docid_allocateid into a single file.
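
A minimal sketch of the dict inversion mentioned in point 2, with made-up topic ids (the same pattern builds ldatopic_realtopic from topic_mapping.data):

#!/usr/bin/python
# invert a {realtopic: ldatopic} mapping into {ldatopic: [realtopic, ...]};
# several real topics may map to the same LDA topic, so values are lists
real_lda = {'r1': '5', 'r2': '3', 'r3': '5'}   # made-up mapping

ldatopic_realtopic = {}
for realtopic in real_lda:
    ldatopic = real_lda[realtopic]
    if ldatopic not in ldatopic_realtopic:
        ldatopic_realtopic[ldatopic] = [realtopic]
    else:
        ldatopic_realtopic[ldatopic].append(realtopic)

print(ldatopic_realtopic)   # {'5': ['r1', 'r3'], '3': ['r2']} (order may vary)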

7. Visualization

#!/usr/bin/python
# -*- coding=utf-8 -*-

import sys
from string import Template

def replace_template(template_file, replaceDict, output_file):
    
    fh = open(template_file, 'r')
    content = fh.read()
    fh.close()
    content_template = Template(content)
    content_final = content_template.safe_substitute(replaceDict)
    
    fout = open(output_file, 'w')
    fout.write(content_final)
    fout.close()

def bar_categories(categories_list):
    categories = "["
    for i in range(len(categories_list)):
        if i == len(categories_list)-1:
            categories += "'"+ categories_list[i] +"']"
        else:
            categories += "'"+ categories_list[i] +"',"
    return categories

def bar_series(data_list):
    series = "[{ name: 'count', data: ["
    for i in range(len(data_list)):
        if i == len(data_list)-1:
            series += str(data_list[i]) +"]}]"
        else:
            series += str(data_list[i]) +","
    return series

def pie_data(data_map):
    data = "["
    index = 0
    for item in data_map:
        if index == len(data_map)-1:
            data += "['"+ str(item) +"',"+ str(data_map[item]) +"]"
        else:
            data += "['"+ str(item) +"',"+ str(data_map[item]) +"],"
        index += 1 # advance the counter so only the last entry is written without a trailing comma
    data += "]"
    return data

def main():

    root_dir = sys.argv[1]

    # topicid and topic's content
    topicid_content = {}
    input_file = root_dir +'/result_data/topic_id.data'
    fin = open(input_file, 'r')
    for line in fin:
        fields = line.strip().split('	')
        topicid_content[fields[0]] = fields[1]
    fin.close()

    #1. Topic distribution
    source_list = ['sina', 'tencent', 'tianya']
    topicid_count = {} 
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic in topicid_count:
                    topicid_count[topic] += 1
                else:
                    topicid_count[topic] = 1
    # all topics sorted by their total count
    sorted_result = sorted(topicid_count.items(), key = lambda d:d[1], reverse=True)
    
    topN = 20
    replaceDict = {}
    replaceDict['title'] = "'话题分布'"
    replaceDict['subtitle'] = "''"
    categories_list = []
    for i in range(topN):
        categories_list.append(topicid_content[ sorted_result[i][0] ])
    replaceDict['categories'] = bar_categories(categories_list)
    
    replaceDict['x_name'] = "'相关微博或帖子条数'"
    
    data_list = []
    for i in range(topN):
        data_list.append(sorted_result[i][1])
    replaceDict['series'] = bar_series(data_list)
    
    template_file = root_dir +'/template/horizontal_bar.tpl'
    output_file = root_dir +'/final_html/1.htm'
    replace_template(template_file, replaceDict, output_file)

            
    
    #2. Topic distribution trend over time

    #3. Male/female ratio of users following each topic
    topN = 10
    topicid_sex = {}
    for i in range(topN):
        topicid_sex[sorted_result[i][0]] = [0, 0]
    source_list = ['sina'] # we only have user profiles for sina currently
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[7] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_sex:
                    continue
                # the comparison characters were lost in the source text; "男" / "女"
                # are assumed from the male/female ratio this section computes
                if fields[7] == "男":
                    topicid_sex[topic][0] += 1
                if fields[7] == "女":
                    topicid_sex[topic][1] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/pie.tpl'
        output_file = root_dir +'/final_html/3-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户男女比例'"
        sum_count = topicid_sex[sorted_result[i][0]][0] + topicid_sex[sorted_result[i][0]][1]
        sex_map = {}
        # the keys were lost in the source text; "男" / "女" assumed, matching the counts above
        sex_map['男'] = topicid_sex[sorted_result[i][0]][0] / float(sum_count)
        sex_map['女'] = topicid_sex[sorted_result[i][0]][1] / float(sum_count)
        replaceDict['data'] = pie_data(sex_map)
        replace_template(template_file, replaceDict, output_file)


    #4. Geographic distribution of users following each topic
    topN = 10
    province_conf = root_dir +'/conf/province.list'
    province_list = []
    province_map = {}
    fin = open(province_conf, 'r')
    index = 0
    for line in fin:
        province = line.strip()
        province_list.append(province)
        province_map[province] = index
        index += 1
    fin.close()
    source_list = ['sina']
    topicid_province = {}
    for i in range(topN):
        topicid_province[sorted_result[i][0]] = []
        for j in range(len(province_list)):
            topicid_province[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[8] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_province:
                    continue
                province_index = int(province_map[fields[8]])
                topicid_province[topic][province_index] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/horizontal_bar.tpl'
        output_file = root_dir +'/final_html/4-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户地域分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'相关微博或帖子条数'"
        replaceDict['categories'] = bar_categories(province_list)
        replaceDict['series'] = bar_series(topicid_province[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)


    #5. Age distribution of users following each topic
    topN = 10
    age_list = ['10岁以下', '10-19岁', '20-29岁', '30-39岁', '40-49岁', '50-59岁', '60岁以上']
    source_list = ['sina']
    topicid_age = {}
    for i in range(topN):
        topicid_age[sorted_result[i][0]] = []
        for j in range(len(age_list)):
            topicid_age[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[9] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_age:
                    continue
                age  = 2013 -int(fields[9])
                if age <= 9:
                    topicid_age[topic][0] += 1
                elif age >= 10 and age <= 19:
                    topicid_age[topic][1] += 1
                elif age >= 20 and age <= 29:
                    topicid_age[topic][2] += 1
                elif age >= 30 and age <= 39:
                    topicid_age[topic][3] += 1
                elif age >= 40 and age <= 49:
                    topicid_age[topic][4] += 1
                elif age >= 50 and age <= 59:
                    topicid_age[topic][5] += 1
                elif age >= 60:
                    topicid_age[topic][6] += 1                    
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/vertical_bar.tpl'
        output_file = root_dir +'/final_html/5-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户年龄分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['y_name'] = "'人数'"
        replaceDict['categories'] = bar_categories(age_list)
        replaceDict['series'] = bar_series(topicid_age[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)
    

    #6. Distribution of source media for each topic
    topN = 10
    source_list = ['sina', 'tencent', 'tianya']
    topicid_source = {}
    for i in range(topN):
        topicid_source[sorted_result[i][0]] = []
        for j in range(len(source_list)):
            topicid_source[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_source:
                    continue
                if source == "sina":
                    topicid_source[topic][0] += 1
                if source == "tencent":
                    topicid_source[topic][1] += 1
                if source == "tianya":
                    topicid_source[topic][2] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/pie.tpl'
        output_file = root_dir +'/final_html/6-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"#  话题来源媒体分布'"
        source_map = {}
        source_map['sina'] = topicid_source[sorted_result[i][0]][0]
        source_map['tencent'] = topicid_source[sorted_result[i][0]][1]
        source_map['tianya'] = topicid_source[sorted_result[i][0]][2]
        replaceDict['data'] = pie_data(source_map)
        replace_template(template_file, replaceDict, output_file)

    #7. Core users following each topic
    topN = 10
    coreuser = 5
    source_list = ['sina']
    topicid_user = {}
    for i in range(topN):
        topicid_user[sorted_result[i][0]] = {}
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[6] == 'null':
                continue
            userid = fields[6]
            for topic in fields[2].split(':'):
                if topic not in topicid_user:
                    continue
                if userid not in topicid_user[topic]:
                    topicid_user[topic][userid] = 1
                else:
                    topicid_user[topic][userid] += 1
        fin.close()
    output_file = root_dir +'/final_html/topic_coreuser.list'
    fout = open(output_file, 'w')
    for i in range(topN):
        title = "#"+ topicid_content[sorted_result[i][0]] +"#  话题核心关注人物"
        print>>fout, title
        sorted_tmp = sorted(topicid_user[sorted_result[i][0]].items(), key = lambda d:d[1], reverse =True)
        n_core = min(coreuser, len(sorted_tmp)) # per-topic limit, so a short list does not shrink coreuser for later topics
        for j in range(n_core):
            print>>fout, "	%s	%s"%(sorted_tmp[j][0], sorted_tmp[j][1]) # userid and related documents count
    fout.close()

    #8. Follower-count distribution of users following each topic
    topN = 10
    fans_list = ['0-100', '101-1000', '1001-10000', '10001-100000', '100001-500000', '500000以上']
    source_list = ['sina']
    topicid_fans = {}
    for i in range(topN):
        topicid_fans[sorted_result[i][0]] = []
        for j in range(len(fans_list)):
            topicid_fans[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[6] == 'null' or fields[10] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_fans:
                    continue
                fans = int(fields[10])
                if fans <= 100:
                    topicid_fans[topic][0] += 1
                elif fans >= 101 and fans <= 1000:
                    topicid_fans[topic][1] += 1
                elif fans >= 1001 and fans <= 10000:
                    topicid_fans[topic][2] += 1
                elif fans >= 10001 and fans <= 100000:
                    topicid_fans[topic][3] += 1
                elif fans >= 100001 and fans <= 500000:
                    topicid_fans[topic][4] += 1
                elif fans >= 500001:
                    topicid_fans[topic][5] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/horizontal_bar.tpl'
        output_file = root_dir +'/final_html/8-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户粉丝数分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'粉丝数'"
        replaceDict['categories'] = bar_categories(fans_list)
        replaceDict['series'] = bar_series(topicid_fans[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    #9. Weibo (post) count distribution of users following each topic
    topN = 10
    weibo_list = ['0-100', '101-1000', '1001-3000', '3001-5000', '5001-10000', '10000以上']
    source_list = ['sina']
    topicid_weibo = {}
    for i in range(topN):
        topicid_weibo[sorted_result[i][0]] = []
        for j in range(len(weibo_list)):
            topicid_weibo[sorted_result[i][0]].append(0)
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null' or fields[6] == 'null' or fields[11] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_weibo:
                    continue
                weibo = int(fields[11]) # weibocount is field 11; field 10 is fanscount
                if weibo <= 100:
                    topicid_weibo[topic][0] += 1
                elif weibo >= 101 and weibo <= 1000:
                    topicid_weibo[topic][1] += 1
                elif weibo >= 1001 and weibo <= 3000:
                    topicid_weibo[topic][2] += 1
                elif weibo >= 3001 and weibo <= 5000:
                    topicid_weibo[topic][3] += 1
                elif weibo >= 5001 and weibo <= 10000:
                    topicid_weibo[topic][4] += 1
                elif weibo >= 10001:
                    topicid_weibo[topic][5] += 1
        fin.close()
    for i in range(topN):
        template_file = root_dir +'/template/horizontal_bar.tpl'
        output_file = root_dir +'/final_html/9-'+ str(i) +'.htm'
        replaceDict = {}
        replaceDict['title'] = "'#"+ topicid_content[sorted_result[i][0]] +"# 关注用户微博数分布'"
        replaceDict['subtitle'] = "''"
        replaceDict['x_name'] = "'微博数'"
        replaceDict['categories'] = bar_categories(weibo_list)
        replaceDict['series'] = bar_series(topicid_weibo[sorted_result[i][0]])
        replace_template(template_file, replaceDict, output_file)

    #10. Attention, diffusion, and activity
    topN = 10
    source_list = ['sina', 'tencent', 'tianya']
    topicid_attention = {}
    topicid_diffuse   = {}
    topicid_active    = {}
    for i in range(topN):
        topicid_attention[sorted_result[i][0]] = set() #userlist
        topicid_diffuse[sorted_result[i][0]]   = {} #user and fans
        topicid_active[sorted_result[i][0]]    = 0 #comment and retweet and praise
    for source in source_list:
        input_file = root_dir +'/lda_model/'+ source +'/final.data'
        fin = open(input_file, 'r')
        for line in fin:
            fields = line.strip().split('	')
            if fields[2] == 'null':
                continue
            for topic in fields[2].split(':'):
                if topic not in topicid_attention:
                    continue
                if fields[6] != 'null':
                    if fields[6] not in topicid_attention[topic]:
                        topicid_attention[topic].add(fields[6])
                    if fields[10] != 'null':
                        if fields[6] not in topicid_diffuse[topic]:
                            topicid_diffuse[topic][fields[6]] = int(fields[10])
                if fields[3] != 'null':
                    topicid_active[topic] += int(fields[3])
                if fields[4] != 'null':
                    topicid_active[topic] += int(fields[4])
                if fields[5] != 'null':
                    topicid_active[topic] += int(fields[5])
        fin.close()
    output_file = root_dir +'/final_html/topic_attention_diffuse_active.list'
    fout = open(output_file, 'w')
    for i in range(topN):
        title = "#"+ topicid_content[sorted_result[i][0]] +"#  关注度、传播度、活跃度"
        print>>fout, title
        attention = len(topicid_attention[sorted_result[i][0]])
        diffuse = 0 
        for user in topicid_diffuse[sorted_result[i][0]]:
            diffuse += topicid_diffuse[sorted_result[i][0]][user]
        active = topicid_active[sorted_result[i][0]]
        print>>fout, "	%s	%s	%s"%(attention, diffuse, active)
    fout.close()
    

if __name__ == "__main__":
    main()
visualization
Original article: https://www.cnblogs.com/bobodeboke/p/3500924.html