Spark机器学习读书笔记-CH03

3.1.获取数据:

wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

3.2.探索与可视化数据:

In [3]: user_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.user")

In [4]: user_data.first()

Out[4]: u'1|24|M|technician|85711'

In [5]: user_fields=user_data.map(lambda line: line.split("|"))

In [8]: num_users = user_fields.map(lambda fields: fields[0]).count()

In [10]: num_genders=user_fields.map(lambda fields: fields[2]).distinct().count()

In [11]: num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()

In [12]: num_zIpcodes=user_fields.map(lambda fields: fields[4]).distinct().count()

In [16]: print "Users: %d, genders: %d, occupations: %d, zip codes: %d" %(num_users, num_genders, num_occupations, num_zipcodes)
Users: 943, genders: 2, occupations: 21, zip codes: 795

In [17]: ages = user_fields.map(lambda x: int(x[1])).collect()

In [18]: hist(ages, bins=20, color='lightblue', normed=True)
Out[18]:
(array([ 0.00064269, 0.00192808, 0.00449886, 0.0279572 , 0.02956393,
0.03374144, 0.04563129, 0.02538642, 0.02088756, 0.01863813,
0.02088756, 0.01606735, 0.0170314 , 0.01863813, 0.00674829,
0.00482021, 0.0054629 , 0.00192808, 0.00128539, 0.00128539]),
array([ 7. , 10.3, 13.6, 16.9, 20.2, 23.5, 26.8, 30.1, 33.4,
36.7, 40. , 43.3, 46.6, 49.9, 53.2, 56.5, 59.8, 63.1,
66.4, 69.7, 73. ]),
<a list of 20 Patch objects>)

n [19]: fig = matplotlib.pyplot.gcf()

In [20]: fig.set_size_inches(16, 10)

In [23]: count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()

In [24]: import numpy as np

In [25]: x_axis1 = np.array([c[0] for c in count_by_occupation])

In [26]: y_axis1 = np.array([c[1] for c in count_by_occupation])

In [27]: x_axis = x_axis1[np.argsort(x_axis1)]

In [28]: y_axis = y_axis1[np.argsort(y_axis1)]

In [29]: pos = np.arange(len(x_axis))

In [30]: width = 1.0

In [31]: ax = plt.axes()

In [32]: ax.set_xticks(pos + (width / 2))
Out[32]:
[<matplotlib.axis.XTick at 0x7f1257bc6f50>,
<matplotlib.axis.XTick at 0x7f1257bc6a10>,
<matplotlib.axis.XTick at 0x7f1256fa2050>,
<matplotlib.axis.XTick at 0x7f1256fa2910>,
<matplotlib.axis.XTick at 0x7f1256fbe090>,
<matplotlib.axis.XTick at 0x7f1256fbe7d0>,
<matplotlib.axis.XTick at 0x7f1256fbef10>,
<matplotlib.axis.XTick at 0x7f1256fc9690>,
<matplotlib.axis.XTick at 0x7f1256fc9dd0>,
<matplotlib.axis.XTick at 0x7f124e6033d0>,
<matplotlib.axis.XTick at 0x7f1257b604d0>,
<matplotlib.axis.XTick at 0x7f124e603c90>,
<matplotlib.axis.XTick at 0x7f1257b602d0>,
<matplotlib.axis.XTick at 0x7f1257b60d90>,
<matplotlib.axis.XTick at 0x7f124e60f510>,
<matplotlib.axis.XTick at 0x7f124e60fc50>,
<matplotlib.axis.XTick at 0x7f124e6183d0>,
<matplotlib.axis.XTick at 0x7f124e618b10>,
<matplotlib.axis.XTick at 0x7f124e623290>,
<matplotlib.axis.XTick at 0x7f124e6239d0>,
<matplotlib.axis.XTick at 0x7f121c583150>]

In [34]: ax.set_xticklabels(x_axis)
Out[34]:
[<matplotlib.text.Text at 0x7f1257bc6410>,
<matplotlib.text.Text at 0x7f1257b68350>,
<matplotlib.text.Text at 0x7f1256fa2790>,
<matplotlib.text.Text at 0x7f1256fa2ed0>,
<matplotlib.text.Text at 0x7f1256fbe650>,
<matplotlib.text.Text at 0x7f1256fbed90>,
<matplotlib.text.Text at 0x7f1256fc9510>,
<matplotlib.text.Text at 0x7f1256fc9c50>,
<matplotlib.text.Text at 0x7f1256fd23d0>,
<matplotlib.text.Text at 0x7f1257c29ad0>,
<matplotlib.text.Text at 0x7f124e603f10>,
<matplotlib.text.Text at 0x7f1257b60510>,
<matplotlib.text.Text at 0x7f1257b60c10>,
<matplotlib.text.Text at 0x7f124e60f390>,
<matplotlib.text.Text at 0x7f124e60fad0>,
<matplotlib.text.Text at 0x7f124e618250>,
<matplotlib.text.Text at 0x7f124e618990>,
<matplotlib.text.Text at 0x7f124e623110>,
<matplotlib.text.Text at 0x7f124e623850>,
<matplotlib.text.Text at 0x7f124e623f90>,
<matplotlib.text.Text at 0x7f121c583710>]

In [35]: plt.bar(pos, y_axis, width, color='lightblue')
Out[35]: <Container object of 21 artists>

In [36]: plt.xticks(rotation=30)
Out[36]:
(array([ 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5,
9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5,

18.5, 19.5, 20.5]), <a list of 21 Text xticklabel objects>)

In [37]: fig = matplotlib.pyplot.gcf()

In [38]: fig.set_size_inches(16, 10)

In [39]: count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()

In [46]: print "Map-reduce approach: "
Map-reduce approach:

In [47]: print dict(count_by_occupation)
{u'administrator': 79, u'writer': 45, u'retired': 14, u'lawyer': 12, u'doctor': 7, u'marketing': 26, u'executive': 32, u'none': 9, u'entertainment': 18, u'healthcare': 16, u'scientist': 31, u'student': 196, u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66, u'artist': 28, u'salesman': 12, u'other': 105, u'homemaker': 7, u'engineer': 67}

In [48]: print ""

In [49]: print "countByValue approach:"
countByValue approach:

In [50]: print dict(count_by_occupation2)
{u'administrator': 79, u'retired': 14, u'lawyer': 12, u'healthcare': 16, u'marketing': 26, u'executive': 32, u'scientist': 31, u'student': 196, u'technician': 27, u'librarian': 51, u'programmer': 66, u'salesman': 12, u'homemaker': 7, u'engineer': 67, u'none': 9, u'doctor': 7, u'writer': 45, u'entertainment': 18, u'other': 105, u'educator': 95, u'artist': 28}

In [51]: movie_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.item")

In [52]: print movie_data.first()
1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

In [53]: num_movies = movie_data.count()

In [54]: print "Movies: %d " % num_movies
Movies: 1682

In [51]: movie_data=sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.item")

In [52]: print movie_data.first()
1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

In [53]: num_movies = movie_data.count()

In [54]: print "Movies: %d " % num_movies
Movies: 1682

In [55]: def convert_year(x):
....: try:
....: return int(x[-4:])
....: except:
....: return 1990
....:

In [56]: movie_fields = movie_data.map(lambda lines: lines.split("|"))

In [57]: years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))

In [58]: years_filtered = years.filter(lambda x: x != 1900)

In [59]: movie_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()

In [60]: values = movie_ages.values()

In [61]: bins = movie_ages.keys()

In [62]: hist(values, bins=bins, color='lightblue', normed=True)
Out[62]:
(array([ 0. , 0.07575758, 0.09090909, 0.09090909, 0.18181818,
0.18181818, 0.04545455, 0.07575758, 0.07575758, 0.03030303,
0. , 0.01515152, 0.01515152, 0.03030303, 0. ,
0.03030303, 0. , 0. , 0. , 0. ,
0. , 0. , 0.01515152, 0. , 0. ,
0.01515152, 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.01515152, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.01515152, 0. , 0. , 0. , 0. ]),
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
68, 72, 76]),
<a list of 70 Patch objects>)

In [63]: fig = matplotlib.pyplot.gcf()

In [64]: fig.set_size_inches(16, 10)

In [65]: rating_data = sc.textFile("file:///root/studio/MachineLearningWithSpark/ch03/ml-100k/u.data")

In [66]: print rating_data.first()
196 242 3 881250949

In [67]: num_ratings = rating_data.count()

In [68]: print "Ratings: %d " % num_ratings
Ratings: 100000

In [76]: rating_data = rating_data.map(lambda line: line.split(" "))

In [77]: ratings = rating_data.map(lambda fields: int(fields[2]))

In [78]: max_rating = ratings.reduce(lambda x, y: max(x, y))

In [79]: min_rating = ratings.reduce(lambda x, y: min(x, y))

In [80]: mean_rating = ratings.reduce(lambda x, y: x + y)/num_ratings

In [81]: median_rating = np.median(ratings.collect())

In [82]: ratings_per_uer = num_ratings / num_users

In [76]: rating_data = rating_data.map(lambda line: line.split(" "))

In [77]: ratings = rating_data.map(lambda fields: int(fields[2]))

In [78]: max_rating = ratings.reduce(lambda x, y: max(x, y))

In [79]: min_rating = ratings.reduce(lambda x, y: min(x, y))

In [80]: mean_rating = ratings.reduce(lambda x, y: x + y)/num_ratings

In [81]: median_rating = np.median(ratings.collect())

In [82]: ratings_per_uer = num_ratings / num_users

In [83]: ratings_per_movie = num_ratings / num_movies

In [84]: print "Min ratings: %d" % min_rating
Min ratings: 1

In [85]: print "Max ratings: %d" % max_rating
Max ratings: 5

In [86]: print "Average rating: %2.2f" % mean_rating
Average rating: 3.00

In [87]: print "Median rating: %d" % mean_rating
Median rating: 3

In [88]: print "Average # of ratings per user: %2.2f" % ratings_per_uer
Average # of ratings per user: 106.00

In [89]: print "Average # of ratings per movie: %2.2f" % ratings_per_movie
Average # of ratings per movie: 59.00

In [90]: ratings.stats()
Out[90]: (count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)

In [91]: count_by_rating = ratings.countByValue()

In [92]: x_axis = np.array(count_by_rating.keys())

In [93]: y_axis = np.array([float(c) for c in count_by_rating.values()])

In [94]: y_axis_normed = y_axis / y_axis.sum()

In [95]: pos = np.arange(len(x_axis))

In [96]: width = 1.0

In [97]: ax = plt.axes()

In [98]: ax.set_xticks(pos + (width / 2))
Out[98]:
[<matplotlib.axis.XTick at 0x7f121c371250>,
<matplotlib.axis.XTick at 0x7f121c360d90>,
<matplotlib.axis.XTick at 0x7f121c2e0e10>,
<matplotlib.axis.XTick at 0x7f121c2df5d0>,
<matplotlib.axis.XTick at 0x7f121c2dfd10>]

In [99]: ax.set_xticklabels(x_axis)
Out[99]:
[<matplotlib.text.Text at 0x7f121c290ed0>,
<matplotlib.text.Text at 0x7f121c298c90>,
<matplotlib.text.Text at 0x7f121c2df450>,
<matplotlib.text.Text at 0x7f121c2dfb90>,
<matplotlib.text.Text at 0x7f121c2fd310>]

In [100]:

In [100]: plt.bar(pos, y_axis_normed, width, color='lightblue')
Out[100]: <Container object of 5 artists>

In [101]: plt.xticks(rotation=30)
Out[101]: (array([ 0.5, 1.5, 2.5, 3.5, 4.5]), <a list of 5 Text xticklabel objects>)

In [102]: fig = matplotlib.pyplot.gcf()

In [103]: fig.set_size_inches(16, 10)

In [104]: user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()

In [105]: user_ratings_by_user = user_ratings_grouped.map(lambda (k, v): (k, len(v)))

In [106]: user_ratings_by_user.take(5)
Out[106]: [(2, 62), (4, 24), (6, 211), (8, 59), (10, 184)]

In [107]: user_ratings_by_user_local = user_ratings_by_user.map(lambda (k, v): v).collect()

In [108]: hist(user_ratings_by_user_local, bins=200, color='lightblue', normed=True)
Out[108]:
(array([ 0.02958007, 0.02129765, 0.01212783, 0.01212783, 0.00798662,
0.00946562, 0.00916982, 0.00739502, 0.00769082, 0.00621181,
0.00887402, 0.00532441, 0.00562021, 0.00414121, 0.00384541,
0.00532441, 0.00236641, 0.00354961, 0.0017748 , 0.0017748 ,
0.00295801, 0.00266221, 0.00325381, 0.00414121, 0.00414121,
0.00266221, 0.0017748 , 0.00236641, 0.00266221, 0.00295801,
0.0020706 , 0.0020706 , 0.00354961, 0.0017748 , 0.00236641,
0.00384541, 0.0017748 , 0.00295801, 0.001479 , 0.00266221,
0.0011832 , 0.001479 , 0.0017748 , 0.0008874 , 0.001479 ,
0.00236641, 0.0020706 , 0.001479 , 0.0008874 , 0.001479 ,
0.0008874 , 0.0020706 , 0.0011832 , 0.0008874 , 0.0020706 ,
0.0002958 , 0.0017748 , 0.0011832 , 0.0011832 , 0.0017748 ,
0.001479 , 0.0011832 , 0.0008874 , 0.0002958 , 0.0005916 ,
0.0002958 , 0.0008874 , 0.0008874 , 0.0002958 , 0.0008874 ,
0.0017748 , 0.001479 , 0.0008874 , 0.0008874 , 0.0005916 ,
0. , 0.0011832 , 0.0002958 , 0.0002958 , 0.0011832 ,
0.0002958 , 0.0005916 , 0.0005916 , 0.0005916 , 0.0005916 ,
0.0008874 , 0. , 0.0008874 , 0. , 0.0002958 ,
0. , 0. , 0.0002958 , 0. , 0.0011832 ,
0.0002958 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
0.0005916 , 0. , 0.0011832 , 0. , 0. ,
0.0008874 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
0. , 0. , 0. , 0. , 0. ,
0.0005916 , 0. , 0. , 0. , 0.0002958 ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.0002958 , 0.0002958 ,
0. , 0.0005916 , 0. , 0. , 0. ,
0. , 0. , 0. , 0.0002958 , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.0002958 , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.0002958 , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.0002958 , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0.0002958 ]),
array([ 20. , 23.585, 27.17 , 30.755, 34.34 , 37.925,
41.51 , 45.095, 48.68 , 52.265, 55.85 , 59.435,
63.02 , 66.605, 70.19 , 73.775, 77.36 , 80.945,
84.53 , 88.115, 91.7 , 95.285, 98.87 , 102.455,
106.04 , 109.625, 113.21 , 116.795, 120.38 , 123.965,
127.55 , 131.135, 134.72 , 138.305, 141.89 , 145.475,
149.06 , 152.645, 156.23 , 159.815, 163.4 , 166.985,
170.57 , 174.155, 177.74 , 181.325, 184.91 , 188.495,
192.08 , 195.665, 199.25 , 202.835, 206.42 , 210.005,
213.59 , 217.175, 220.76 , 224.345, 227.93 , 231.515,
235.1 , 238.685, 242.27 , 245.855, 249.44 , 253.025,
256.61 , 260.195, 263.78 , 267.365, 270.95 , 274.535,
278.12 , 281.705, 285.29 , 288.875, 292.46 , 296.045,
299.63 , 303.215, 306.8 , 310.385, 313.97 , 317.555,
321.14 , 324.725, 328.31 , 331.895, 335.48 , 339.065,
342.65 , 346.235, 349.82 , 353.405, 356.99 , 360.575,
364.16 , 367.745, 371.33 , 374.915, 378.5 , 382.085,
385.67 , 389.255, 392.84 , 396.425, 400.01 , 403.595,
407.18 , 410.765, 414.35 , 417.935, 421.52 , 425.105,
428.69 , 432.275, 435.86 , 439.445, 443.03 , 446.615,
450.2 , 453.785, 457.37 , 460.955, 464.54 , 468.125,
471.71 , 475.295, 478.88 , 482.465, 486.05 , 489.635,
493.22 , 496.805, 500.39 , 503.975, 507.56 , 511.145,
514.73 , 518.315, 521.9 , 525.485, 529.07 , 532.655,
536.24 , 539.825, 543.41 , 546.995, 550.58 , 554.165,
557.75 , 561.335, 564.92 , 568.505, 572.09 , 575.675,
579.26 , 582.845, 586.43 , 590.015, 593.6 , 597.185,
600.77 , 604.355, 607.94 , 611.525, 615.11 , 618.695,
622.28 , 625.865, 629.45 , 633.035, 636.62 , 640.205,
643.79 , 647.375, 650.96 , 654.545, 658.13 , 661.715,
665.3 , 668.885, 672.47 , 676.055, 679.64 , 683.225,
686.81 , 690.395, 693.98 , 697.565, 701.15 , 704.735,
708.32 , 711.905, 715.49 , 719.075, 722.66 , 726.245,
729.83 , 733.415, 737. ]),
<a list of 200 Patch objects>)

In [109]: fig = matplotlib.pyplot.gcf()

In [110]: fig.set_size_inches(16, 10)

In [111]: hist(user_ratings_by_user_local, bins=200, color='lightblue', normed=True)
Out[111]:
(array([ 0.02958007, 0.02129765, 0.01212783, 0.01212783, 0.00798662,
0.00946562, 0.00916982, 0.00739502, 0.00769082, 0.00621181,
0.00887402, 0.00532441, 0.00562021, 0.00414121, 0.00384541,
0.00532441, 0.00236641, 0.00354961, 0.0017748 , 0.0017748 ,
0.00295801, 0.00266221, 0.00325381, 0.00414121, 0.00414121,
0.00266221, 0.0017748 , 0.00236641, 0.00266221, 0.00295801,
0.0020706 , 0.0020706 , 0.00354961, 0.0017748 , 0.00236641,
0.00384541, 0.0017748 , 0.00295801, 0.001479 , 0.00266221,
0.0011832 , 0.001479 , 0.0017748 , 0.0008874 , 0.001479 ,
0.00236641, 0.0020706 , 0.001479 , 0.0008874 , 0.001479 ,
0.0008874 , 0.0020706 , 0.0011832 , 0.0008874 , 0.0020706 ,
0.0002958 , 0.0017748 , 0.0011832 , 0.0011832 , 0.0017748 ,
0.001479 , 0.0011832 , 0.0008874 , 0.0002958 , 0.0005916 ,
0.0002958 , 0.0008874 , 0.0008874 , 0.0002958 , 0.0008874 ,
0.0017748 , 0.001479 , 0.0008874 , 0.0008874 , 0.0005916 ,
0. , 0.0011832 , 0.0002958 , 0.0002958 , 0.0011832 ,
0.0002958 , 0.0005916 , 0.0005916 , 0.0005916 , 0.0005916 ,
0.0008874 , 0. , 0.0008874 , 0. , 0.0002958 ,
0. , 0. , 0.0002958 , 0. , 0.0011832 ,
0.0002958 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
0.0005916 , 0. , 0.0011832 , 0. , 0. ,
0.0008874 , 0.0002958 , 0.0002958 , 0. , 0.0002958 ,
0. , 0. , 0. , 0. , 0. ,
0.0005916 , 0. , 0. , 0. , 0.0002958 ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.0002958 , 0.0002958 ,
0. , 0.0005916 , 0. , 0. , 0. ,
0. , 0. , 0. , 0.0002958 , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.0002958 , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0.0002958 , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.0002958 , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0.0002958 ]),
array([ 20. , 23.585, 27.17 , 30.755, 34.34 , 37.925,
41.51 , 45.095, 48.68 , 52.265, 55.85 , 59.435,
63.02 , 66.605, 70.19 , 73.775, 77.36 , 80.945,
84.53 , 88.115, 91.7 , 95.285, 98.87 , 102.455,
106.04 , 109.625, 113.21 , 116.795, 120.38 , 123.965,
127.55 , 131.135, 134.72 , 138.305, 141.89 , 145.475,
149.06 , 152.645, 156.23 , 159.815, 163.4 , 166.985,
170.57 , 174.155, 177.74 , 181.325, 184.91 , 188.495,
192.08 , 195.665, 199.25 , 202.835, 206.42 , 210.005,
213.59 , 217.175, 220.76 , 224.345, 227.93 , 231.515,
235.1 , 238.685, 242.27 , 245.855, 249.44 , 253.025,
256.61 , 260.195, 263.78 , 267.365, 270.95 , 274.535,
278.12 , 281.705, 285.29 , 288.875, 292.46 , 296.045,
299.63 , 303.215, 306.8 , 310.385, 313.97 , 317.555,
321.14 , 324.725, 328.31 , 331.895, 335.48 , 339.065,
342.65 , 346.235, 349.82 , 353.405, 356.99 , 360.575,
364.16 , 367.745, 371.33 , 374.915, 378.5 , 382.085,
385.67 , 389.255, 392.84 , 396.425, 400.01 , 403.595,
407.18 , 410.765, 414.35 , 417.935, 421.52 , 425.105,
428.69 , 432.275, 435.86 , 439.445, 443.03 , 446.615,
450.2 , 453.785, 457.37 , 460.955, 464.54 , 468.125,
471.71 , 475.295, 478.88 , 482.465, 486.05 , 489.635,
493.22 , 496.805, 500.39 , 503.975, 507.56 , 511.145,
514.73 , 518.315, 521.9 , 525.485, 529.07 , 532.655,
536.24 , 539.825, 543.41 , 546.995, 550.58 , 554.165,
557.75 , 561.335, 564.92 , 568.505, 572.09 , 575.675,
579.26 , 582.845, 586.43 , 590.015, 593.6 , 597.185,
600.77 , 604.355, 607.94 , 611.525, 615.11 , 618.695,
622.28 , 625.865, 629.45 , 633.035, 636.62 , 640.205,
643.79 , 647.375, 650.96 , 654.545, 658.13 , 661.715,
665.3 , 668.885, 672.47 , 676.055, 679.64 , 683.225,
686.81 , 690.395, 693.98 , 697.565, 701.15 , 704.735,
708.32 , 711.905, 715.49 , 719.075, 722.66 , 726.245,
729.83 , 733.415, 737. ]),
<a list of 200 Patch objects>)

3.3. 处理与转换数据；

In [112]: years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()

In [113]: years_pre_processed_array = np.array(years_pre_processed)

In [114]: mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])

In [115]: median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])

In [122]: index_bad_data = np.where(years_pre_processed_array == 1900)[0]

In [123]: index_bad_data
Out[123]: array([], dtype=int64)

In [124]: years_pre_processed_array[index_bad_data] = median_year

In [125]: print "Mean year of release: %d" % mean_year
Mean year of release: 1989

In [126]: print "Median year of release: %d" % median_year
Median year of release: 1995

In [130]: print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]
Index of '1900' after assigning median: []

3.4.从数据中提取有用特征:

In [131]: all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect()

In [132]: all_occupations.sort()

In [133]:

In [133]: idx = 0

In [134]: all_occupations_dict = {}

In [135]: for o in all_occupations:
.....: all_occupations_dict[o] = idx
.....: idx += 1
.....:

In [136]: print "Encoding of 'doctor': %d" %all_occupations_dict['doctor']
Encoding of 'doctor': 2

In [137]: print "Encoding of 'programmer': %d" %all_occupations_dict['programmer']
Encoding of 'programmer': 14

In [139]: k = len(all_occupations_dict)

In [140]: binary_x = np.zeros(k)

In [141]: k_programmer = all_occupations_dict['programmer']

In [142]: binary_x[k_programmer] = 1

In [143]: print "Binary feature vector: %s" %binary_x
Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0.]

In [144]: print "Length of binary vector: %d" %k
Length of binary vector: 21

In [145]: def extract_datetime(ts):
.....: import datetime
.....: return datetime.datetime.fromtimestamp(ts)
.....:

In [149]: timestamps = rating_data.map(lambda fields: int(fields[3]))

In [150]: hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour)

In [151]: hour_of_day.take(5)
Out[151]: [23, 3, 15, 13, 13]

In [154]: def assign_tod(hr):
.....: times_of_day = {
.....: 'morning' : range(7, 12),
.....: 'lunch' : range(12, 14),
.....: 'afternoon' : range(14, 18),
.....: 'evening' : range(18, 23),
.....: 'night' : range(23, 7)
.....: }
.....: for k, v in times_of_day.iteritems():
.....: if hr in v:
.....: return k
.....:

In [166]: def assign_tod(hr):
.....: times_of_day = {
.....: 'morning' : range(7, 12),
.....: 'lunch' : range(12, 14),
.....: 'afternoon' : range(14, 18),
.....: 'evening' : range(18, 23),
.....: 'night' : range(23, 24) + range(0, 7)
.....: }
.....: for k, v in times_of_day.iteritems():
.....: if hr in v:
.....: return k
.....:

In [167]:

In [167]: time_of_day = hour_of_day.map(lambda hr: assign_tod(hr))

In [168]: time_of_day.take(5)
Out[168]: ['night', 'night', 'afternoon', 'lunch', 'lunch']

In [170]: def extract_titile(raw):
.....: import re
.....: grps = re.search("((w+))", raw)
.....: if grps:
.....: return raw[:grps.start()].strip()
.....: else:
.....: return raw
.....:

In [171]: raw_titles = movie_fields.map(lambda fields: fields[1])

In [172]: for raw_title in raw_titles.take(5):
.....: print extract_titile(raw_title)
.....:
Toy Story
GoldenEye
Four Rooms
Get Shorty
Copycat

In [173]: movie_titles = raw_titles.map(lambda m: extract_titile(m))

In [174]: title_terms = movie_titles.map(lambda t: t.split(" "))

In [175]: print title_terms.take(5)
[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]

In [176]: all_terms = title_terms.flatMap(lambda x: x).distinct().collect()

In [177]: idx = 0

In [178]: all_terms_dict = {}

In [179]: for term in all_terms:
.....: all_occupations_dict[term] = idx
.....: idx += 1
.....:

In [180]: print "Total number of terms: %d" % len(all_terms_dict)
Total number of terms: 0

In [181]: print "Index of term 'Dead': %d" % all_occupations_dict['Dead']
Index of term 'Dead': 147

In [182]: print "Index of term 'Rooms': %d" % all_occupations_dict['Rooms']
Index of term 'Rooms': 1963

In [184]: %paste

def create_vector(terms, term_dict):
from scipy import sparse as sp
num_terms = len(term_dict)
x = sp.csc_matrix((1, num_terms))
for t in terms:
if t in term_dict:
idx = term_dict[t]
x[0, idx] = 1
return x
## -- End pasted text --

In [185]:

In [185]: all_terms_bcast = sc.broadcast(all_terms_dict)

In [186]: term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))

In [187]: term_vectors.take(5)
Out[187]:
[<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>,
<1x0 sparse matrix of type '<type 'numpy.float64'>'
with 0 stored elements in Compressed Sparse Column format>]

In [188]: np.random.seed(42)

In [189]: x = np.random.randn(10)

In [190]: norm_x_2 = np.linalg.norm(x)

In [191]: normalized_x = x /norm_x_2

In [192]: print "x: %s" % x
x:
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]

In [193]: print "Normalized x: %s" % normalized_x
Normalized x:
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]

In [194]: print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)
2-Norm of normalized_x: 1.0000

In [199]: vector = sc.parallelize([x])

In [200]: from pyspark.mllib.feature import Normalizer

In [201]: normalizer = Normalizer()

In [202]: vector = sc.parallelize([x])

In [203]: normalized_x_mllib = normalizer.transform(vector).first().toArray()

In [204]: print "x: %s" % x
x:
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]

In [205]: print "2-Norm of x: %2.4f" % norm_x_2
2-Norm of x: 2.5908

In [206]: print "Normalized x MLlib: %s" % normalized_x_mllib
Normalized x MLlib:
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]

In [207]: print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
2-Norm of normalized_x_mllib: 1.0000