第五周学习总结

1. 学会了用 Python 做文本聚类:大致步骤是先用 jieba 分词,再用 CountVectorizer 和 TfidfTransformer 得到词袋与 TF-IDF 权重矩阵,最后用 KMeans 进行聚类

import codecs
import pandas as pd
import jieba

# Load the source table; the second column holds the texts to cluster.
data = pd.read_csv("scien1.csv")

col = data.iloc[:, 1]
# Empty cells are read as NaN (a float), which jieba.cut cannot segment,
# so normalise every cell to a string first.
arrs = col.fillna("").astype(str).values

# Tokens dropped after segmentation.
# NOTE(review): the original list held five identical empty strings, which
# look like stop words lost when the code was pasted -- restore them here.
stopwords = {'', '非常'}

print(u"\n中文分此后结果:")

# One space-separated token string per document, the input format that
# CountVectorizer expects.
curpus = []
for text in arrs:
    # cut_all=False selects jieba's precise mode.
    kept = ''.join(
        seg for seg in jieba.cut(text, cut_all=False) if seg not in stopwords
    )
    # The filtered text is segmented a second time, exactly as before:
    # removing tokens can change how the neighbouring characters are split.
    curpus.append(' '.join(jieba.cut(kept, cut_all=False)))

#%%

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Build the term-count matrix. The corpus belongs in fit_transform(); the
# constructor's first parameter is `input`, so passing the corpus there is
# an error.
# NOTE(review): the default token_pattern keeps only tokens of two or more
# characters, so single-character Chinese words are dropped -- confirm that
# is intended.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(curpus)

# Re-weight the counts as TF-IDF, reusing the counts computed above instead
# of fitting the vectorizer a second time.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(x)

# Vocabulary terms, in matrix column order. get_feature_names() was removed
# in scikit-learn 1.2; fall back to it only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()

print('')
# toarray() already returns a dense ndarray; the `pd.np` alias no longer
# exists in current pandas.
docs_matrix = x.toarray()
print(docs_matrix)
weight = tfidf.toarray()

# Dump the vocabulary and the TF-IDF matrix to a UTF-8 text file.
# The two original cells are merged so the file can be opened and closed in
# a single `with` block (the handle was never closed before).
resName = "BaiduTfidf_Result.txt"
with codecs.open(resName, 'w', 'utf-8') as result:
    # Header: every vocabulary term, space separated.
    for term in word:
        result.write(term + ' ')
        print(term + ' ')
    result.write('\n\n')

    # One block per document so the rows can be told apart when the file is
    # read back; previously all weights ran together on one line.
    for row in weight:
        result.write(''.join(str(value) + ' ' for value in row))
        result.write('\n\n')

#%%

from sklearn.cluster import KMeans

# Cluster the TF-IDF rows into 20 groups.
# NOTE(review): no random_state is set, so labels can differ between runs.
clf = KMeans(n_clusters=20)
s = clf.fit(weight)
print(s)
print(clf.cluster_centers_)
print(clf.labels_)

# Cluster label per document as plain Python ints, in document order.
number = clf.labels_.tolist()
for i, label in enumerate(number, start=1):
    # 1-based document index followed by its cluster label.
    print(i, label)
print(number)
# inertia_ helps judge whether the number of clusters is suitable: smaller
# values mean tighter clusters; pick the cluster count at the elbow point.
#print(clf.inertia_)

#%%

# Attach each document's cluster label to the source table as column "julei".
# `number` must hold exactly one label per CSV row.
first = pd.read_csv("scien1.csv")
first['julei'] = number

#%%

# A bare `first.head()` shows nothing when run as a script, so print it.
print(first.head())
# to_csv returns None when given a path, so there is nothing to keep.
first.to_csv("result.csv")

2. ECharts GL 关系图(graphGL)的使用

<%-- JSP page hosting a full-window ECharts GL graph; the inline script below draws into #container. --%>
<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<!DOCTYPE html>
<html style="height: 100%">
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
</head>
<body style="height: 100%; margin: 0">
       <!-- Chart mount point; its 100% height relies on html and body also being 100% high. -->
       <div id="container" style="height: 100%"></div>
       <!-- ECharts core, ECharts GL and further extensions, loaded from jsDelivr without a pinned version. -->
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/echarts.min.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-gl/dist/echarts-gl.min.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts-stat/dist/ecStat.min.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/extension/dataTool.min.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/map/js/china.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/map/js/world.js"></script>
       <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/echarts/dist/extension/bmap.min.js"></script>
       <!-- Local jQuery copy; the inline script uses $.when, $.getJSON and $.getScript. -->
       <script type="text/javascript" src="assets/js/jquery-3.3.1.js"></script>
       <!-- Inline script: loads the graph data and configures the graphGL series. -->
       <script type="text/javascript">
// Draws a graphGL (ECharts GL) network in #container from data/b.json.
// The JSON is read as: `nodes` (array of node names), `dependentsCount`
// (one value per node, same order) and `edges` (flat array in which each
// consecutive pair is one source/target link).
var dom = document.getElementById("container");
var myChart = echarts.init(dom);
var app = {};
// Declared explicitly; the original assignment created an implicit global.
var option = null;
$.when(
    $.getJSON("data/b.json"),
    $.getScript("data/graph-modularity.js")
).done(function (res) {
    // With several deferreds, each callback argument is an array of
    // [data, statusText, jqXHR]; res belongs to the getJSON call.
    var data = res[0];
    var nodes = data.nodes.map(function (nodeName, idx) {
        return {
            name: nodeName,
            value: data.dependentsCount[idx]
        };
    });

    // Consume the flat edge list two entries at a time. Stopping at
    // i + 1 avoids emitting an edge with an undefined target when the
    // array has an odd length.
    var edges = [];
    for (var i = 0; i + 1 < data.edges.length; i += 2) {
        edges.push({
            source: data.edges[i],
            target: data.edges[i + 1]
        });
    }

    nodes.forEach(function (node) {
        // Every node shows its label when emphasized (hovered).
        node.emphasis = {
            label: {
                show: true
            }
        };
        // Only nodes with a value above 5000 show a permanent label.
        if (node.value > 5000) {
            node.label = {
                show: true
            };
        }
    });

    myChart.setOption({
        backgroundColor: '#000',
        series: [{
            color: ["rgb(203,239,15)", "rgb(73,15,239)","rgb(15,217,239)","rgb(30,15,239)","rgb(15,174,239)","rgb(116,239,15)","rgb(239,15,58)","rgb(15,239,174)","rgb(239,102,15)","rgb(239,15,15)","rgb(15,44,239)","rgb(239,145,15)","rgb(30,239,15)","rgb(239,188,15)","rgb(159,239,15)","rgb(159,15,239)","rgb(15,239,44)","rgb(15,239,87)","rgb(15,239,217)","rgb(203,15,239)","rgb(239,15,188)","rgb(239,15,102)","rgb(239,58,15)","rgb(239,15,145)","rgb(116,15,239)","rgb(15,131,239)","rgb(73,239,15)","rgb(15,239,131)","rgb(15,87,239)","rgb(239,15,231)"],
            type: 'graphGL',
            nodes: nodes,
            edges: edges,
            modularity: {
                resolution: 2,
                sort: true
            },
            lineStyle: {
                color: 'rgba(255,255,255,1)',
                opacity: 0.05
            },
            itemStyle: {
                opacity: 1
            },
            focusNodeAdjacency: false,
            focusNodeAdjacencyOn: 'click',
            // Node size grows with the square root of its value.
            symbolSize: function (value) {
                return Math.sqrt(value / 10);
            },
            label: {
                textStyle: {
                    color: '#fff'
                }
            },
            emphasis: {
                label: {
                    show: false
                },
                lineStyle: {
                    opacity: 0.5,
                    // The key was missing here (a bare `4`, a syntax error);
                    // restored as the emphasized line width.
                    width: 4
                }
            },
            forceAtlas2: {
                steps: 5,
                stopThreshold: 20,
                // Spelling kept as written: this is the option name that
                // ECharts GL itself uses.
                jitterTolerence: 10,
                edgeWeight: [0.2, 1],
                gravity: 5,
                edgeWeightInfluence: 0
            }
        }]
    });
}).fail(function (jqXHR, textStatus, error) {
    // Report a failed data or script load instead of leaving a blank chart.
    console.error("Failed to load graph resources:", textStatus, error);
});
// `option` is never assigned an object above, so this guard does not fire
// as written; it is kept for code that sets `option` before this point.
if (option && typeof option === "object") {
    myChart.setOption(option, true);
}
       </script>
</body>
</html>
原文地址:https://www.cnblogs.com/liujinxin123/p/12548283.html