Python Learning Notes

1. Importing packages

The following import style is not recommended, because it pollutes the namespace:

from numpy import *

Write it this way instead:

import numpy as np

2. NumPy and assignment

A plain assignment such as b = a does not copy a's values into b; afterwards a and b refer to the same object, so modifying a also modifies b.

The benefit is efficiency: NumPy avoids copying data whenever it can.
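
A minimal sketch of the difference (plain assignment shares the object; copy() duplicates the data):

import numpy as np

a = np.arange(4)
b = a              # b and a are the same object; nothing is copied
b[0] = 99
print(a)           # [99  1  2  3] -- the change is visible through a

c = a.copy()       # an explicit, independent copy
c[1] = -1
print(a)           # still [99  1  2  3]; modifying c does not touch a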

3. NumPy's clipping function

clip
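
A small example of clip, which limits array values to a given range:

import numpy as np

x = np.array([-3, 0, 2, 7, 11])
print(np.clip(x, 0, 10))   # [ 0  0  2  7 10] -- values below 0 become 0, above 10 become 10
print(x.clip(0, 10))       # the same operation, called as an ndarray method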

4. Drawing a scatter plot

#!/usr/bin/python
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np

# scatter plot
X=np.array([1,2,3,4])
X_EN=np.array(['one','two','three','four'])
Y=np.array([2,4,6,8])
plt.scatter(X,Y)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("study")
plt.xticks(X,X_EN)
plt.grid()
plt.show()

5. Functions can be passed as arguments

def add(x,y,f):
    return f(x)+f(y)

# example call: add(-3,-4,abs) returns abs(-3)+abs(-4) = 7

6. Classifying iris flowers (1)

#!/usr/bin/python
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
import numpy as np
data = load_iris()
features = data['data']
target = data['target']
# first visualize the relation between two of the features; labels: setosa=0, versicolor=1, virginica=2
for t,marker,color in zip(xrange(3),">ox","rgb"):
    plt.scatter(features[target==t,0],features[target==t,3],marker=marker,c=color)
plt.show()

plength=features[:,2]
is_setosa=(target==0)
max_setosa = plength[is_setosa].max()  # maximum petal length among setosa
min_no_setosa=plength[~is_setosa].min()  # minimum petal length among the other two species
print max_setosa,' ',min_no_setosa

# distinguishing the other two species -- see the sketch below
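
The original note stops here; a rough sketch of one way to continue (not part of the original post) is a brute-force threshold search over the remaining features to separate versicolor from virginica:

features_2 = features[~is_setosa]          # keep only versicolor and virginica
labels_2 = target[~is_setosa]
is_virginica = (labels_2 == 2)

best_acc = -1.0
for fi in xrange(features_2.shape[1]):     # try every feature
    for t in features_2[:, fi]:            # and every observed value as a candidate threshold
        pred = (features_2[:, fi] > t)
        acc = (pred == is_virginica).mean()
        if acc > best_acc:
            best_acc, best_fi, best_t = acc, fi, t
print best_acc, best_fi, best_t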

7. Linear regression

#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation  # renamed to sklearn.model_selection in newer versions (see section 13)

# prepare the dataset
Xtrain=numpy.array([[1],[2],[3],[6],[7],[8]])
Ytrain=numpy.array([[1],[2],[3],[4],[7],[8]])
# train the model
linreg=LinearRegression()
linreg.fit(Xtrain,Ytrain)  # fit expects 2-D arrays

print linreg.intercept_  # intercept
print linreg.coef_  # coefficients

# predict
y_pre=linreg.predict([[9]])  # predict expects a 2-D array
print 'prediction for X=9:',y_pre

# model evaluation: mean squared error (square the errors, sum them, then average)
print 'MSE',metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain)
# model evaluation: root mean squared error
print 'RMSE',numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain))
# hold-out validation: train_test_split returns x_train, x_test, y_train, y_test in that order
x_train,x_test,y_train,y_test=cross_validation.train_test_split(Xtrain,Ytrain,test_size=0.3,random_state=0)
linreg.fit(x_train,y_train)
print 'hold-out R^2 score:',linreg.score(x_test,y_test)

The same model with multi-dimensional input (two features per sample):

#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

# prepare the dataset (two features per sample)
Xtrain=numpy.array([[1,2],[2,2],[3,3],[6,5],[7,4],[8,9]])
Ytrain=numpy.array([[3],[4],[6],[11],[11],[17]])
# train the model
linreg=LinearRegression()
linreg.fit(Xtrain,Ytrain)  # fit expects 2-D arrays

print linreg.intercept_  # intercept
print linreg.coef_  # coefficients

# predict
y_pre=linreg.predict([[9,8]])  # predict expects a 2-D array
print 'prediction for X=[9,8]:',y_pre

# model evaluation: mean squared error (square the errors, sum them, then average)
print 'MSE',metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain)
# model evaluation: root mean squared error
print 'RMSE',numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain))
# cross-validation omitted here (see section 13)

8. The TF-IDF algorithm

TF (term frequency) measures how often a term appears within a single document; IDF (inverse document frequency) is log(number of documents / number of documents containing the term); the TF-IDF score is their product.

import math

def tfIdf(term,doc,docset):
    # term frequency: occurrences of the term in this document, normalized by the document length
    tf=float(doc.count(term))/len(doc)
    # inverse document frequency: log(total documents / documents containing the term)
    idf=math.log(float(len(docset))/len([a_doc for a_doc in docset if term in a_doc]))
    return tf*idf

a,abb,abc=['a'],['a','b','b'],['a','b','c']
D=[a,abb,abc]

print tfIdf('a',a,D)
print tfIdf('b',a,D)
print tfIdf('a',abb,D)
print tfIdf('b',abb,D)
print tfIdf('b',abc,D)

9. K-means clustering

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs  # in newer sklearn: from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# X holds the sample features and y the cluster labels: 1000 samples, 2 features each,
# 4 blobs centered at [-1,-1], [0,0], [1,1], [2,2] with standard deviations [0.4, 0.2, 0.2, 0.2]
X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.2, 0.2, 0.2],
                  random_state=16)
#plt.scatter(X[:, 0], X[:, 1], marker='o')
#plt.show()

model = KMeans(n_clusters=6,random_state=9).fit(X)  # note: asks for 6 clusters although only 4 blobs were generated
pre_y=model.predict(X)
X1=X[pre_y==0]
X2=X[pre_y==1]
X3=X[pre_y==2]
X4=X[pre_y==3]
X5=X[pre_y==4]
X6=X[pre_y==5]
plt.scatter(X1[:, 0], X1[:, 1], marker='o')
plt.scatter(X2[:, 0], X2[:, 1], marker='*',color="r")
plt.scatter(X3[:, 0], X3[:, 1], marker='+')
plt.scatter(X4[:, 0], X4[:, 1], marker='.')
plt.scatter(X5[:, 0], X5[:, 1], marker='>')
plt.scatter(X6[:, 0], X6[:, 1], marker='^',color="g")
plt.show()

10. Latent Dirichlet Allocation (LDA) topic models

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim import corpora,models,similarities
documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

# remove stop words and tokenize
# note: this stop-word list is only an example; real text needs a fuller list
#       for Chinese text, use the jieba segmenter: https://github.com/fxsjy/jieba
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint   # pretty-printer
pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict') # save the dictionary so it can be reused later
print(dictionary)
# inspect the integer id assigned to each word
print 'id2word:',dictionary.token2id
# build the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # persist to disk for later use
print corpus
# build the LDA model
model = models.ldamodel.LdaModel(corpus,num_topics=2,id2word=dictionary)
print 'model:',model
topics=[model[c] for c in corpus]
print 'topic0',topics[0]
# to be continued

A second implementation, using scikit-learn's LatentDirichletAllocation together with jieba for Chinese word segmentation:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import jieba.analyse
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


def pre_detail():
    jieba.analyse.set_stop_words("./Data/stopWords")
    with open("./Data/push_candidate_10w") as f:
        documents = f.readlines()
        with open("./Data/nlp_test", mode='w') as f2:
            for document in documents:
                f2.write(" ".join(jieba.analyse.extract_tags(document, topK=None)) + "\n")


if __name__ == '__main__':
    print("LDA topic model")
    with open("./Data/nlp_test", mode='r') as train_file:
        words = train_file.readlines()
        cntVector = CountVectorizer()
        cntTf = cntVector.fit_transform(words)  # 词频向量
        lda = LatentDirichletAllocation(n_topics=100)  # n_topics was renamed n_components in newer sklearn versions
        theme = lda.fit_transform(cntTf)
        paras = [''] * 100  # one accumulator string per topic (the model above uses 100 topics)
        with open("./Data/tmp", mode='r') as result_file:
            push = result_file.readlines()
            np.set_printoptions(threshold=np.nan)
            max_p = np.max(theme, axis=1)
            for i in range(0, max_p.size):
                paras[np.where(theme[i] == max_p[i])[0][0]] += str(push[i]).decode("utf-8")
        print str(paras).decode("unicode-escape").replace("', u", '\n----------------------------------'
                                                                  '-----------------------------------\n')

11. Saving and loading objects with pickle

    # static helpers (e.g. on the DataProcess class used in later sections); requires: import pickle
    @staticmethod
    def save_obj(obj,file_name):
        with open(file_name,'wb') as f:
            pickle.dump(obj,f)

    @staticmethod
    def load_obj(file_name):
        with open(file_name,'rb') as f:
            return pickle.load(f)
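
A minimal usage sketch, assuming the two methods above sit on the DataProcess class that the later sections load models from (the file path here is only an example):

DataProcess.save_obj({"threshold": 0.4}, "./data/example.pkl")   # example path, not from the original post
restored = DataProcess.load_obj("./data/example.pkl")
print(restored)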

12. K-nearest-neighbor (KNN) parameters

  n_neighbors - how many nearest points to use

  weights - how the neighbors are weighted; options: 'uniform' (all points count equally), 'distance' (weight by inverse distance), or a callable

  algorithm - the algorithm used to find the nearest neighbors ('auto', 'ball_tree', 'kd_tree' or 'brute'); a short sketch of these parameters follows
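
A minimal sketch of these parameters on a made-up toy dataset:

from sklearn.neighbors import KNeighborsClassifier

X = [[0], [1], [2], [10], [11], [12]]    # toy 1-D samples
y = [0, 0, 0, 1, 1, 1]

knn = KNeighborsClassifier(n_neighbors=3, weights="distance", algorithm="auto")
knn.fit(X, y)
print(knn.predict([[1.5], [9.0]]))       # expected: [0 1]

The KNN-based message classifier from these notes follows: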

import json
import jieba
import math
from MsgIdentification.DataProcess import DataProcess
from sklearn.neighbors import KNeighborsClassifier
class CompositeModel:

    def __init__(self):
        super().__init__()
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.knn = KNeighborsClassifier(n_neighbors=4,weights="distance")
        self.knn.fit(self.X,self.labelList)

    def predict(self, msgInfo: object) -> object:
        in_json = json.loads(msgInfo)
        msg = in_json['body']
        #if(len(DataProcess.chinese_reg.findall(msg))<16):
        #    return False
        #length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg),1.5)
        #if length > 0.12 or length < 0.000001:
        #    return False
        #if len(DataProcess.rubish_reg.findall(msg)):
        #    return True

        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if (len(word) == 1):
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        return self.knn.predict(new_vec)

Testing the KNN classifier:

from MsgIdentification.CompositeModel import CompositeModel, DataProcess
import json
import datetime

if __name__ == '__main__':
    # file = open("./data/data")
    # msgInfos = file.readlines()
    data_train = DataProcess();
    data_train.dataToVector()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl");
    model = CompositeModel()
    i = 0.02;
    right = 0;
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        in_json = json.loads(msgInfo)
        result = model.predict(msgInfo)
        label = in_json['spam']

        i += 1
        if not label:
            false_count += 1

        if result == label:
            right += 1
            if result:
                effect_identify += 1

        if result and label == False:
            critical_error += 1

    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
          effect_identify * 1.0 / (i - false_count))

13. Packages used for cross-validation

  from sklearn import model_selection

  or use KFold directly:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.model_selection import KFold
import numpy as np

cv = KFold(n_splits=3, shuffle=True)
X = np.asarray([10, 22, 32, 41, 35, 46, 57, 18, 59])
for train, test in cv.split(X):
    print(X[train])
    print(X[test])
    print('=========')
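
model_selection also provides cross_val_score for scoring a model across all folds in one call; a small sketch (the linear data below is made up for illustration):

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np

X = np.arange(12).reshape(-1, 1)
y = 2 * X.ravel() + 1
scores = cross_val_score(LinearRegression(), X, y, cv=3)   # one R^2 score per fold
print(scores)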

A helper that splits the data into 2^n equal parts:

from sklearn.model_selection import train_test_split

def dataset_split_n(X, Y, n_fold):
    """
    Split X and Y into 2**n_fold equal parts.
    :param X: features
    :param Y: labels
    :param n_fold: number of times the data is halved
    :return: lists containing the 2**n_fold parts of X and Y
    """
    parent_x = []
    parent_y = []
    child_x = []
    child_y = []
    parent_x.append(X)
    parent_y.append(Y)
    for n in range(0, n_fold):
        child_x = []
        child_y = []
        for index in range(0, len(parent_x)):
            tmp_x1, tmp_x2, tmp_y1, tmp_y2 = train_test_split(parent_x[index], parent_y[index], test_size=0.5)
            child_x.append(tmp_x1)
            child_x.append(tmp_x2)
            child_y.append(tmp_y1)
            child_y.append(tmp_y2)
        parent_x = child_x
        parent_y = child_y
    return child_x, child_y

14. Naive Bayes model and test

from sklearn.naive_bayes import MultinomialNB
from MsgIdentification.DataProcess import DataProcess
import json
import jieba
import math


class Bayes:
    def __init__(self):
        super().__init__()
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.clf = MultinomialNB()
        self.clf.fit(self.X, self.labelList)

    def predict(self, msgInfo):
        in_json = json.loads(msgInfo)
        msg = in_json['body']

        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if len(word) == 1:
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        if len(DataProcess.chinese_reg.findall(msg)) < 16:
            return False
        length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
        if length > 0.12 or length < 0.000001:
            return False
        if len(DataProcess.rubish_reg.findall(msg)):
            return True
        return self.clf.predict(new_vec)


if __name__ == '__main__':
    #dataProcess = DataProcess()
    #dataProcess.dataToVector()
    bayes = Bayes()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    i = 0.02;
    right = 0;
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        result = bayes.predict(msgInfo)
        in_json = json.loads(msgInfo)
        label = in_json['spam']
        i += 1
        if not label:
            false_count += 1

        if result == label:
            right += 1
            if result:
                effect_identify += 1

        if result and label == False:
            critical_error += 1

    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
          effect_identify * 1.0 / (i - false_count))

15. Decision tree model and test

  Decision trees roughly come in three types: ID3 (based on entropy and information gain), C4.5 (based on the information gain ratio), and CART (based on the Gini index); see the short sketch below for how the split criterion is chosen in scikit-learn.
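
scikit-learn's DecisionTreeClassifier implements an optimized CART-style tree; which impurity measure it splits on is controlled by the criterion parameter. A minimal sketch on the iris data from section 6:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
for criterion in ("gini", "entropy"):     # Gini index vs. information gain (entropy)
    clf = DecisionTreeClassifier(criterion=criterion, max_depth=3, random_state=0)
    clf.fit(data['data'], data['target'])
    print(criterion, clf.score(data['data'], data['target']))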

from sklearn import tree
from MsgIdentification.DataProcess import DataProcess
import json
import math
import jieba
import pydotplus
import graphviz


class DecisionTree:
    def __init__(self):
        super().__init__()
        self.clf = tree.DecisionTreeClassifier(max_depth=100, min_samples_split=5, min_samples_leaf=3)
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.clf.fit(self.X, self.labelList)

    def predict(self, msgInfo):
        json_temp = json.loads(msgInfo)
        msg = json_temp['body']

        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if len(word) == 1:
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        if len(DataProcess.chinese_reg.findall(msg)) < 16:
            return False
        length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
        if length > 0.12 or length < 0.000001:
            return False
        if len(DataProcess.rubish_reg.findall(msg)):
            return True
        return self.clf.predict(new_vec)

    def save_pdf(self):
        dot_data = tree.export_graphviz(self.clf, out_file=None)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("./data/decisionTree.pdf")


if __name__ == '__main__':
    decisionTree = DecisionTree()
    decisionTree.save_pdf()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    i = 0.02;
    right = 0;
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        result = decisionTree.predict(msgInfo)
        in_json = json.loads(msgInfo)
        label = in_json['spam']
        i += 1
        if not label:
            false_count += 1

        if result == label:
            right += 1
            if result:
                effect_identify += 1

        if result and label == False:
            critical_error += 1

    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
          effect_identify * 1.0 / (i - false_count))

16. A neural network with TensorFlow

import tensorflow as tf
import numpy as np
from MsgIdentification.DataProcess import DataProcess
import json
import jieba


def add_layer(inputs, in_size, out_size, activation_function=None):
    # add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_normal([in_size, out_size]))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs


# define placeholders that will receive the input data
xs = tf.placeholder(tf.float32, [None, 3729])
ys = tf.placeholder(tf.float32, [None, 1])

l1 = add_layer(xs, 3729, 50, activation_function=tf.nn.relu)
#l2 = add_layer(l1, 50, 10, activation_function=None)
prediction = add_layer(l1, 50, 1, activation_function=None)
loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction),
                                    reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
init = tf.initialize_all_variables()  # deprecated; newer TF 1.x uses tf.global_variables_initializer()
saver = tf.train.Saver()
sess = tf.Session()

# nothing defined above actually runs until sess.run is called
sess.run(init)
x = DataProcess.load_obj("./data/model.pkl")
label_list = DataProcess.load_obj("./data/labelList.pkl")
x_data = np.asarray(x.toarray())
y_data = np.asarray(label_list).reshape(444, 1)

saver.restore(sess, "./data/neuralNetwork.ckpt")

#for i in range(10000000):
#    # train_step and loss are operations defined on placeholders, so the data must be fed in via feed_dict
#    sess.run(train_step, feed_dict={xs: x_data, ys: y_data})
#    if i % 10 == 0:  # to see the step improvement
#        now_loss = sess.run(loss, feed_dict={xs: x_data, ys: y_data})
#        print(now_loss)
#        if now_loss < 0.0001:
#            break
#    if i % 1000 == 0:
#        save_path = saver.save(sess, "./data/neuralNetwork.ckpt")
#        print("Save to path: ", save_path)

msgInfos = DataProcess.load_obj("./data/test_data.pkl");
vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
i = 0
for i in range(0, len(msgInfos) - 2, 50):
    msgInfo = msgInfos[i]
    in_json = json.loads(msgInfo)
    msg = in_json['body']
    label = in_json['spam']
    seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
    dellist = []
    for word in seg_list:
        if (len(word) == 1):
            dellist.append(word)
    for word in dellist:
        seg_list.remove(word)
    new_sentence = " ".join(seg_list)
    new_vec = vectorizer.transform([new_sentence])
    print(sess.run(prediction, feed_dict={xs: np.asarray(new_vec.toarray())}) > 0.4, label)
Original article: https://www.cnblogs.com/tengpan-cn/p/7156543.html