03.描述性统计代码

  1 from collections import Counter
  2 from math import sqrt
  3 
  4 def frequency(data):
  5     """频率"""
  6     counter = Counter(data)
  7     ret = []
  8     for point in counter.most_common():
  9         ret.append((point[0], point[1] / len(data)))
 10     return ret
 11 
 12 
 13 def mode(data):
 14     """众数"""
 15     counter = Counter(data)
 16     if counter.most_common()[0][1] == 1:
 17         return None, None
 18 
 19     count = counter.most_common()[0][1]
 20     ret = []
 21     for point in counter.most_common():
 22         if point[1] == count:
 23             ret.append(point[0])
 24         else:
 25             break
 26     return ret, count
 27 
 28 
 29 def median(data):
 30     """中位数"""
 31     sorted_data = sorted(data)
 32     n = len(sorted_data)
 33 
 34     if n % 2 == 1:
 35         return sorted_data[n // 2]
 36 
 37     return (sorted_data[n // 2 -1] + sorted_data[n // 2]) / 2
 38 
 39 
 40 def mean(data):
 41     """均值"""
 42     return sum(data) / len(data)
 43 
 44 
 45 def rng(data):
 46     """极差"""
 47     return max(data) - min(data)
 48 
 49 
 50 def quartile(data):
 51     """四分位数"""
 52     n = len(data)
 53     q1, q2, q3 = None, None, None
 54     if n >= 4:
 55         sorted_data = sorted(data)
 56         q2 = median(sorted_data)
 57         if n % 2 == 1:
 58             q1 = median(sorted_data[:n // 2])
 59             q3 = median(sorted_data[n // 2 + 1:])
 60         else:
 61             q1 = median(sorted_data[:n // 2])
 62             q3 = median(sorted_data[n // 2:])
 63 
 64     return q1, q2, q3
 65 
 66 
 67 def variance(data):
 68     """方差"""
 69     n = len(data)
 70     if n <= 1:
 71         return None
 72 
 73     mean_value = mean(data)
 74     return sum((e - mean_value) ** 2 for e in data) / (n - 1)
 75 
 76 
 77 def std(data):
 78     """标准差"""
 79     return sqrt(variance(data))

作图:

import matplotlib.pyplot as plt
import random
from collections import Counter

if __name__ == "__main__":
    # Demo script: draws one figure per chart type, showing each in turn.
    # The seed is fixed so every run produces the same random samples.
    random.seed(666)

    # Scatter plot of 100 random (x, y) points in [0, 100].
    xs = [random.randint(0, 100) for _ in range(100)]
    ys = [random.randint(0, 100) for _ in range(100)]
    plt.scatter(xs, ys)
    plt.show()

    # Line plot of 100 random values against their index.
    heights = [random.randint(0, 100) for _ in range(100)]
    plt.plot(list(range(100)), heights)
    plt.show()

    # Bar plot of how often each value occurs in a small fixed sample.
    observations = [3, 3, 4, 1, 5, 4, 2, 1, 5, 4, 4, 4, 5, 3, 2, 1, 4, 5, 5]
    ranked = Counter(observations).most_common()
    values = [value for value, _ in ranked]
    counts = [count for _, count in ranked]
    plt.bar(values, counts)
    plt.show()

    # Histogram of 1000 random values split into 5 bins.
    sample = [random.randint(1, 100) for _ in range(1000)]
    plt.hist(sample, bins=5, rwidth=0.8, density=True)
    plt.show()

    # Box plot of 1000 random values plus two values far outside the
    # sampled 1-100 range.
    sample = [random.randint(1, 100) for _ in range(1000)]
    sample.append(200)
    sample.append(-200)
    plt.boxplot(sample)
    plt.show()

    # Two box plots side by side for samples drawn from different ranges.
    group_a = [random.randint(66, 166) for _ in range(200)]
    group_b = [random.randint(60, 120) for _ in range(200)]
    plt.boxplot([group_a, group_b])
    plt.show()
原文地址:https://www.cnblogs.com/waterr/p/14136271.html