线性回归的Spark实现 [Linear Regression / Machine Learning / Spark]

1- 问题提出

2- 线性回归

3- 理论推导

4- Python/Spark实现

 1 # -*- coding: utf-8 -*-
 2 from pyspark import SparkContext
 3 
 4 
 5 theta = [0, 0]
 6 alpha = 0.001
 7 
 8 sc = SparkContext('local')
 9 
10 def func_theta_x(x):
11     return sum([i * j for i, j in zip(theta, x)])
12 
13 def cost(x):
14     thx = func_theta_x(x)
15     return thx - x[-1]
16 
17 def partial_theta(x):
18     dif = cost(x)
19     return [dif * i for i in x[:-1]]
20 
21 rdd = sc.textFile('/home/freyr/linearRegression.txt')
22         .map(lambda line: map(float, line.strip().split('	')))
23 
24 maxiter = 400
25 iter = 0
26 while True:
27     parTheta = rdd.map(partial_theta)
28                   .reduce(lambda x, y: [i + j for i, j in zip(x, y)])
29 
30     for i in range(2):
31         theta[i] = theta[i] - alpha * parTheta[i]
32 
33     iter += 1
34 
35     if iter <= maxiter:
36         if sum(map(abs, parTheta)) <= 0.01:
37             print 'I get it!!!'
38             print 'Iter = %s' % iter
39             print 'Theta = %s' % theta
40             break
41     else:
42         print 'Failed...'
43         break

PS: 1. linearRegression.txt