Python Data Visualization Cookbook 2.9.2

 1 import numpy as np
 2 import matplotlib.pyplot as plt
 3 
 4 
 5 def is_outlier(points, threshold=3.5):
 6     if len(points.shape) == 1:
 7         points = points[:, None]
 8 
 9     # Find the median number of points
10     median = np.median(points, axis=0)
11 
12     diff = np.sum((points - median)**2, axis=-1)
13     diff = np.sqrt(diff)
14     MAD = np.median(diff)
15 
16     MZS = 0.6745 * diff / MAD
17 
18     return MZS > threshold
19 
# Number of bins shared by both histograms.
buckets = 50

# Sample data: 100 values drawn with np.random.random, followed by four
# hand-picked values far away from them to act as outliers.
x = np.concatenate((np.random.random(100), [-49, 95, 100, -100]))

# is_outlier() returns one boolean per element of x. Inverting that
# mask and indexing x with it keeps only the elements that were NOT
# flagged, e.g. indexing [1, 2, 3, 4] with
# [False, True, True, False] yields [2, 3].
filtered = x[~is_outlier(x)]

# Start a new figure holding two stacked panels.
plt.figure()

# Top panel (row 1 of 2): histogram of the data including outliers.
plt.subplot(2, 1, 1)
plt.hist(x, bins=buckets)
plt.xlabel('Raw')

# Bottom panel (row 2 of 2): histogram of the data with outliers removed.
plt.subplot(2, 1, 2)
plt.hist(filtered, bins=buckets)
plt.xlabel('Cleaned')

# Display the figure.
plt.show()