shape into blocks--source code in python based on pySpark

这是微博深度和广度预测的原始代码,写了大约半个月,第一个版本不是这样的,但是这个版本包含所有需要的功能。

模块化的程度也更高。找工作前一直想用python完美解决这个问题,后来发现自己的方法和硬件都有很大的局限。

算是我的第一次正儿八经的尝试在分布式计算的框架下,计算海量的数据。

意识到很多问题,影响我面试时候很多的代码风格。

def get_basic_info():
    win_path = "E:/spark/weibo_predict/"
    linux_path = "/home/jason/spark/weibo_predict/"
    path = linux_path

    train_path = path + 'train/'
    test_path = path + 'test/'
    code_path = path + 'source_code/'
    
    print('
训练准备文件保存路径px:%s' % train_path)
    print('
测试准备文件保存路径py:%s' % test_path)
    print('
代码准备文件保存路径pz:%s' % code_path)

    train_weibo_raw_path = path + "train_weibo_raw.txt"
    train_weibo_repost_path = path + "train_weibo_repost_back.txt" 

    test_weibo_raw_path = path + "test_weibo_raw.txt"
    test_weibo_repost_path = path + "test_weibo_repost.txt"

    user_relations_path = path + "user_relations_back.txt"         
                                                                                                                                                                                                                                                                                                                                            
    print("
训练原始微博地址p1:%s" % train_weibo_raw_path)
    print("训练转发微博地址p2:%s" % train_weibo_repost_path)
    print("
测试原始微博地址p3:%s" % test_weibo_raw_path)
    print("测试转发微博地址p4:%s" % test_weibo_repost_path)
    print("
    用户关系地址p5:%s" % user_relations_path)
    return train_path,test_path,code_path,train_weibo_raw_path,train_weibo_repost_path,test_weibo_raw_path,test_weibo_repost_path,user_relations_path
#传递  训练(原始微博,转发微博) 或者 测试(原始微博,转发微博)  
#返回化简后的对应关系repost_id_line_time_reduce
#返回微博id对应的用户idwid_uid_rdd
from pyspark import SparkContext
def get_prime_rdd(train_or_test,sc, p1,p2,p3,p4):
    if train_or_test == 'train':
        inside_path_a = p1
        inside_path_b = p2
    elif train_or_test == 'test':
        inside_path_a = p3
        inside_path_b = p4
    else:
        print("only input train or test")
        return 0,0
      
    sc = sc
    train_weibo_raw_data = sc.textFile(inside_path_a)
    train_weibo_raw_data_count = train_weibo_raw_data.count()
    train_weibo_raw_data_rdd = train_weibo_raw_data.map(lambda x: x.split("01"))
    w_id=train_weibo_raw_data_rdd.map(lambda x:x[0])
    u_id=train_weibo_raw_data_rdd.map(lambda x:x[1])
    wid_uid_rdd = w_id.zip(u_id)
    
    train_weibo_repost_data = sc.textFile(inside_path_b)
    train_weibo_repost_data_count = train_weibo_repost_data.count()
    train_weibo_repost_data_rdd = train_weibo_repost_data.map(lambda x: x.split("01"))
    repost_id = train_weibo_repost_data_rdd.map(lambda x: x[0])
    repost_line_time = train_weibo_repost_data_rdd.map(lambda x: x[1:-1])
    repost_id_line_time = repost_id.zip(repost_line_time)
    repost_id_line_time_reduce = repost_id_line_time.groupByKey().mapValues(list)
    
    repost_id_line_time_reduce = repost_id_line_time_reduce.subtractByKey(repost_id_line_time_reduce.subtractByKey(wid_uid_rdd))
    wid_uid_rdd = wid_uid_rdd.subtractByKey(wid_uid_rdd.subtractByKey(repost_id_line_time_reduce))
        
    return repost_id_line_time_reduce,wid_uid_rdd 
def get_uid_fnum_rdd(sc,p5):
    sc = sc
    user_relations_data = sc.textFile(p5)
    user_relations_data_count = user_relations_data.count()
    user_relations_data_rdd_1 = user_relations_data.map(lambda x: x.split("	")[0])
    user_relations_data_rdd_2 = user_relations_data.map(lambda x: x.split("	")[1])
    user_relations_data_rdd_user = user_relations_data_rdd_1
    user_relations_data_rdd_fans = user_relations_data_rdd_2.map(lambda x: x.split("x01"))
    user_fans = user_relations_data_rdd_user.zip(user_relations_data_rdd_fans)
    fans_nums = user_relations_data_rdd_fans.map(lambda s:len(s))
    uid_fnum_rdd = user_fans.keys().zip(fans_nums)
    return uid_fnum_rdd
##版本 2  分时间段计算指定时间段的转发量
def cal_times_j(list,j):
    ct = 0
    for i in range(len(list)):
        #if int(list[i][-1]) >= j*900 and int(list[i][-1]) <= (j+1)*900:
        #这里可以切换求累计的转发量还是区间的转发量
        if int(list[i][-1]) <= (j)*900:
            ct += 1
    return ct
def cal_id_times_j(rdd,j):
    times = rdd.values().map(lambda x: cal_times_j(x,j))
    rdd = rdd.keys().zip(times)
    return rdd 

def generate_times_file(rdd,k,path):
    for j in range(k-1,k+1):
        import csv
        a_path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
        #print(path)
        out_file_train_times_j = open(a_path,'w')
        writer = csv.writer(out_file_train_times_j);
        zhuanfa = cal_id_times_j(rdd,j+1)
        for lists in zhuanfa.collect():
            writer.writerow(lists)
        out_file_train_times_j.close()
#计算深度
#定义函数,计算出指定阶段的,发生过的转发关系
def cal_during(list,j):
    new_list=[]
    for i in range(len(list)):
        if int(list[i][-1]) <= j*900:
            new_list.append(list[i])
    return new_list

#定义函数,计算一个rdd中,指定阶段,发生过的转发关系
def cal_rdd_during(rdd,j):
    return rdd.map(lambda x: cal_during(x,j))
    
#定义函数,如果一个转发关系的尾部,是另外一个转发关系的头,那么久把这个头的尾部,加到这个转发关系的尾部
def add_deep(list):
    kkk = len(list)
    if kkk<=1:
        return list
    else:
        for i in range(kkk):
            for j in range(kkk):
                if list[i][-1] == list[j][0]:
                    list[i].append(list[j][-1])
    return list

#定义函数返回序列中的数组的最长的值,作为最大的深度
def max_deep(list):
    max=2
    if len(list)==0:
        return 0
    else:
        for i in range(len(list)):
            max = (len(list[i]) if len(list[i])> max else max)
    return max-1

#定义函数,取出其中的两列
def ti_qu(list):
    for i in range(len(list)):
        list[i] = list[i][:-1]
    return list

def cal_cal(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                     #获取ID的RDD
    line_time_rdd = all_in_one_rdd.values()            #获取转发关系和转发时间对应的RDD
    line_time_rdd_j = cal_rdd_during(line_time_rdd,j)  #指定时间段,获取这个时间段发生过的转发和时间组成的RDD
    line_rdd_j = line_time_rdd_j.map(lambda x : ti_qu(x))#提取转发关系
    line_rdd_j_extend = line_rdd_j.map(lambda x: add_deep(x))#延长转发关系
    line_rdd_j_extend_maxdeep = line_rdd_j_extend.map(lambda x:max_deep(x))#计算最大深度
    id_deep_rdd_j = id_rdd.zip(line_rdd_j_extend_maxdeep)#组合微博ID与深度
    return id_deep_rdd_j

def generate_deeps_file(rdd,k,path):
    import csv
    for j in range(k-1,k+1):
        b_path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
        #print(path)
        out_file_train_deeps_j = open(b_path,'w')
        writer = csv.writer(out_file_train_deeps_j);
        shendu = cal_cal(rdd,j+1)
        for lists in shendu.collect():
            writer.writerow(lists)
        out_file_train_deeps_j.close()
def get_wid_fnum_rdd(uid_fnum_rdd,wid_uid_rdd,path):
    #print("用户和粉丝个数的对应关系,取出来一个看看:")
    #print(uid_fnum_rdd.take(3))
    #print(uid_fnum_rdd.count())
    #print("
训练原始约减微博的id和发送微博的人的id的对应rdd:")
    #print(wid_uid_rdd.take(3))
    #print(wid_uid_rdd.count())
    uid_wid_rdd = wid_uid_rdd.values().zip(wid_uid_rdd.keys())
    uid__wid_fnum = uid_wid_rdd.leftOuterJoin(uid_fnum_rdd)
    wid_fnum_rdd = uid__wid_fnum.values().map(lambda x: x[0]).zip(uid__wid_fnum.values().map(lambda x: x[1])) 
    #print(wid_fnum_rdd.take(2))
    #print(wid_fnum_rdd.count())
    import csv
    c_path = str(path) + 'wid_fnum_file.csv'
    wid_fnum_file = open(c_path,"w")
    writer = csv.writer(wid_fnum_file);
    for lists in wid_fnum_rdd.collect():
        writer.writerow(lists);
    wid_fnum_file.close()
    
    return wid_fnum_rdd
#定义函数,将列表数组扁平化
def add_flat(list):
    if list==None:
        return 0
    else: 
        kkk = len(list)
        list0 = list[0]
        for i in range(kkk):
            if i==0:
                pass
            else:
                list0 = list0.append(list[i])
        return list0

#定义函数,计算覆盖用户数目
def clac_cover(list):
    total_cover=0
    for i in range(len(list)):
        total_cover += cover_value(list[i])
    return total_cover
        
#定义函数,计算某个用户的粉丝数:
def cover_value(user):
    '''
    try:
        return uid_fnum_dict[user]
    except:
        return 0
    '''
    for i in range(len(list_uid_fnum)):
        if user == list_uid_fnum[i][0]:
            return list_uid_fnum[i][1]
    else:
        return 0
def flatmapvalues(x):
    return x

def cal_sum(x):
    sum = 0
    if x==None and len(x)==0:
        return sum
    else:
        for i in range(len(x)):
            if x[i]== None:
                pass
            else:
                sum += int(x[i])
        return sum

def fans_cover_till_j(all_in_one_rdd,j):
    id_rdd = all_in_one_rdd.keys()                     #获取微博ID的RDD
    line_time_rdd = all_in_one_rdd.values()            #获取转发关系和转发时间对应的RDD
    line_time_rdd_j = cal_rdd_during(line_time_rdd,j)  #指定时间段,获取这个时间段发生过的转发和时间组成的RDD
    #print("
指定时间段,获取这个时间段发生过的转发和时间组成的RDD");print(line_time_rdd_j.first())
    line_rdd_j = line_time_rdd_j.map(lambda x : ti_qu(x))#提取转发关系
    #print("
提取转发关系");print(line_rdd_j.first())
    
    #line_rdd_j.flatMap(lambda x: re.sub(r'D'," ",x).split())
    #line_rdd_j_flat = line_rdd_j.map(lambda x: add_flat(x))#扁平化转发关系,不行
    import re
    line_rdd_j_flat = line_rdd_j.map(lambda x: re.sub(r'D'," ",str(x)).split())#扁平化转发关系
    #print("
提取扁平化的转发关系");print(line_rdd_j_flat.first())
    
    line_rdd_j_flat_disc = line_rdd_j_flat.map(lambda x:list(set(list(x))))   #扁平化之后约减重复的用户ID
    #print("
看看去重之后的转发用户");print(line_rdd_j_flat_disc.first())
    
    fans_cover_rdd_j = id_rdd.zip(line_rdd_j_flat_disc)
    #print("
看看去重之后的微博ID和转发用户");print(fans_cover_rdd_j.first())
    
    fans_cover_rdd_j = fans_cover_rdd_j.flatMapValues(flatmapvalues)
    #print("
看看去重之后的微博ID和转发用户,一对一flatmap之后");print(fans_cover_rdd_j.first())
    
    fans_cover_rdd_j = fans_cover_rdd_j.values().zip(fans_cover_rdd_j.keys())
    #print("
翻转id和用户");print(fans_cover_rdd_j.first())
    
    fans_cover_rdd_j = fans_cover_rdd_j.leftOuterJoin(uid_fnum_rdd).values()
    #print("
得到用户id_(微博ID,粉丝)");print(fans_cover_rdd_j.first())
    #print(fans_cover_rdd_j.count())
    
    fans_cover_rdd_j = fans_cover_rdd_j.map(lambda x: x[0]).zip(fans_cover_rdd_j.map(lambda x:x[1]))
    #print("
得微博id_粉丝");print(fans_cover_rdd_j.first())
    #print(fans_cover_rdd_j.count())
    
    fans_cover_rdd_j = fans_cover_rdd_j.groupByKey().mapValues(list)
    #print("
组合,");print(fans_cover_rdd_j.first())
    #print(fans_cover_rdd_j.count())
    
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    #print("
map求和");print(fans_cover_rdd_j.first())
    
    #cover_rdd = line_rdd_j_flat_disc.map(lambda x: clac_cover(x))
    #fans_cover_rdd_j = id_rdd.zip(cover_rdd)#组合微博ID与覆盖数目
    #print(id_deep_rdd_j.first())
    #return line_rdd_j_extend_maxdeep
    temp_key_0 = all_in_one_rdd.keys().zip(all_in_one_rdd.values().map(lambda x: 0))
    
    fans_cover_rdd_j = temp_key_0.leftOuterJoin(fans_cover_rdd_j)
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    
    
    return fans_cover_rdd_j

def generate_covers_file(rdd,k,path):
    #按理说没问题
    import csv
    for j in range(k-1,k+1):
        c_path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
        #print(c_path)
        out_file_train_covers_j = open(c_path,'w')
        writer = csv.writer(out_file_train_covers_j)
        covers  = fans_cover_till_j(rdd,j+1)
        for lists in covers.collect():
            writer.writerow(lists)
        out_file_train_covers_j.close()
px,py,pz,p1,p2,p3,p4,p5 = get_basic_info()
uid_fnum_rdd = get_uid_fnum_rdd(sc,p5)
train_repost_id_line_time_reduce, train_wid_uid_rdd = get_prime_rdd('train',sc,p1,p2,p3,p4)
#wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,train_wid_uid_rdd,px)
#generate_times_file(train_repost_id_line_time_reduce,292,px)
#generate_deeps_file(train_repost_id_line_time_reduce,292,px)
#generate_covers_file(train_repost_id_line_time_reduce,292,px)

test_repost_id_line_time_reduce, test_wid_uid_rdd = get_prime_rdd('test',sc,p1,p2,p3,p4)
#test_wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,test_wid_uid_rdd,py)
#generate_times_file(test_repost_id_line_time_reduce,16,py)
#generate_deeps_file(test_repost_id_line_time_reduce,16,py)
#generate_covers_file(test_repost_id_line_time_reduce,16,py)
from pyspark.mllib.regression import LabeledPoint
import numpy as np 
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import SparseVector,DenseVector

#获取用户ID和粉丝数的对比
def get_wid_fnum_rdd(path):
    path = path+ 'wid_fnum_file'+'.csv'
    wid_fnum_rdd = sc.textFile(path)
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x:x.split(","))
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x:x[0]).zip(wid_fnum_rdd.map(lambda x:x[1]))
    wid_fnum_rdd = wid_fnum_rdd.sortByKey()
    return wid_fnum_rdd

def add_keys(rdd1):
    rdd1 = rdd1
    #path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(15)+'.txt'
    #rdd1 = sc.textFile(path)
    rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
    rdd2 = rdd2.map(lambda x:x.split(',')[0]).zip(rdd2.map(lambda x:x.split(',')[1]))
    rdd2 = rdd2.sortByKey()
    rdd1 = rdd1.zipWithIndex()
    rdd1 = rdd1.values().zip(rdd1.keys())
    rdd2 = rdd2.keys().zipWithIndex()
    rdd2 = rdd2.values().zip(rdd2.keys())
    rdd = rdd2.join(rdd1)
    rdd = rdd.values()
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    return rdd

#获取其他三个需要的参数
def get_wid_x(j,path,times_or_deeps_or_covers):
    if times_or_deeps_or_covers == 'times':
        if path == px:
            path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
        elif path ==py:
            if j>=0 and j<15:
                path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
            elif j>=15 and j<=291:
                path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(j)+'.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'deeps':
        if path == px:
            path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
        elif path ==py:
            if j>=0 and j<15:
                path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
            elif j>=15 and j<=291:
                path = '/home/jason/spark/weibo_predict/predicts/deeps_time_data_'+str(j)+'.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd    
    elif times_or_deeps_or_covers == 'covers':
        if path == px:
            path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
        elif path ==py:
            if j>=0 and j<15:
                path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
            elif j>=15 and j<=291:
                path = '/home/jason/spark/weibo_predict/predicts/covers_time_data_'+str(j)+'.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    else:
        print('wrong input about times_or_deeps_or_covers')
        return 0
    rdd = sc.textFile(path)
    rdd = rdd.map(lambda x:x.split(","))
    rdd = rdd.map(lambda x:x[0]).zip(rdd.map(lambda x:x[1]))
    rdd = rdd.sortByKey()
    return rdd



#将两个RDDjoin返回一个rdd的函数
def my_join(rdd1,rdd2):
    import re
    rdd = rdd1.join(rdd2).keys().zip(rdd1.join(rdd2).values().map(lambda x:re.sub(r'D',"  ",str(x)).split()))
    return rdd

#根据rdd的元素制作lib_svm格式文件
def lib_svm(x):
    str1 = str(x[0] + ' ')
    for i in range(len(x)):
        if i == 0:
            pass
        else:
            str1 += str(str(i) + ":" +str(x[i])+ ' ')
    return str1

#生成测试或者训练需要的数据
def generate_train_or_test_data(path,j,times_or_deeps):
    if times_or_deeps == 'times':
        if path == px:
            data_path = str(px) + 'train_data/times_train_data_'+str(j)+'.txt'
            wid_times_rdd = get_wid_x(j+1,path,'times')
        elif path == py:
            data_path = str(py) + 'test_data/times_test_data_'+str(j)+'.txt'
            wid_times_rdd = get_wid_x(j,path,'times')
            #print(wid_times_rdd.count())
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        wid_deeps_rdd = get_wid_x(j,path,'deeps')
        wid_covers_rdd = get_wid_x(j,path,'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        records = my_join(wid_times_rdd,wid_fnum_rdd)
        records = my_join(records,wid_deeps_rdd)
        records = my_join(records,wid_covers_rdd)
        records = records.sortByKey()
        #print('看看训练集合中的keys()的顺序-------------------------------------------')
        #print(records.keys().take(10))
        records = records.values()
        data = records.map(lambda x:lib_svm(x))
        open_data_path = open(data_path,'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('
')
    elif times_or_deeps == 'deeps':
        if path == px:
            data_path = str(px) + 'train_data/deeps_train_data_'+str(j)+'.txt'
        elif path == py:
            data_path = str(py) + 'test_data/deeps_test_data_'+str(j)+'.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_deeps_rdd = get_wid_x(j,path,'deeps')
        else:
            wid_deeps_rdd = get_wid_x(j+1,path,'deeps')
        wid_times_rdd = get_wid_x(j,path,'times')
        wid_deeps_rdd = get_wid_x(j,path,'deeps')
        wid_covers_rdd = get_wid_x(j,path,'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        records = my_join(wid_deeps_rdd,wid_fnum_rdd)
        records = my_join(records,wid_times_rdd)
        records = my_join(records,wid_covers_rdd)
        records = records.values()
        data = records.map(lambda x:lib_svm(x))
        open_data_path = open(data_path,'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('
')
        open_data_path.close()
    elif times_or_deeps == 'covers':
        if path == px:
            data_path = str(px) + 'train_data/covers_train_data_'+str(j)+'.txt'
        elif path == py:
            data_path = str(py) + 'test_data/covers_test_data_'+str(j)+'.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_covers_rdd = get_wid_x(j,path,'covers')
        else:
            wid_covers_rdd = get_wid_x(j+1,path,'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        wid_times_rdd = get_wid_x(j,path,'times')
        wid_deeps_rdd = get_wid_x(j,path,'deeps')
        
        records = my_join(wid_covers_rdd,wid_fnum_rdd)
        records = my_join(records,wid_times_rdd)
        records = my_join(records,wid_deeps_rdd)
        records = records.values()
        data = records.map(lambda x:lib_svm(x))
        open_data_path = open(data_path,'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('
')
        open_data_path.close()
    else:
        return 0



#生成指定时段的预测结果
def generate_test_predict(j,times_or_deeps):
    if times_or_deeps == 'times':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'times_train_data_'+str(j)+'.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'times_test_data_'+str(j)+'.txt'
        train_data = MLUtils.loadLibSVMFile(sc,tr_path)
        test_data = MLUtils.loadLibSVMFile(sc,te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32,seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/'+'times_time_data_'+str(j+1)+'.txt'
        times_predict = open(pre_path,'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('
')
        times_predict.close()
    elif times_or_deeps == 'deeps':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'deeps_train_data_'+str(j)+'.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'deeps_test_data_'+str(j)+'.txt'
        train_data = MLUtils.loadLibSVMFile(sc,tr_path)
        test_data = MLUtils.loadLibSVMFile(sc,te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32,seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/'+'deeps_time_data_'+str(j+1)+'.txt'
        times_predict = open(pre_path,'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('
')
        times_predict.close()
    elif times_or_deeps == 'covers':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'covers_train_data_'+str(j)+'.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'covers_test_data_'+str(j)+'.txt'
        train_data = MLUtils.loadLibSVMFile(sc,tr_path)
        test_data = MLUtils.loadLibSVMFile(sc,te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32,seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/'+'covers_time_data_'+str(j+1)+'.txt'
        times_predict = open(pre_path,'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('
')
        times_predict.close()
        
    
    
def generate_test_data_beyond15(j):
    path = '/home/jason/spark/weibo_predict/predicts/'+'time_data_'+str(j)+'.txt'
    rdd2 = sc.textFile(path)
    rdd1 = get_wid_fnum_rdd(py).keys()
    rdd = rdd1.zip(rdd2)
    return rdd
    
def add_keys(rdd1):
    rdd1 = rdd1
    #path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(15)+'.txt'
    #rdd1 = sc.textFile(path)
    rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
    rdd2 = rdd2.map(lambda x:x.split(',')[0]).zip(rdd2.map(lambda x:x.split(',')[1]))
    rdd2 = rdd2.sortByKey()
    rdd1 = rdd1.zipWithIndex()
    rdd1 = rdd1.values().zip(rdd1.keys())
    rdd2 = rdd2.keys().zipWithIndex()
    rdd2 = rdd2.values().zip(rdd2.keys())
    rdd = rdd2.join(rdd1)
    rdd = rdd.values()
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    return rdd
for i in range(15):
    generate_train_or_test_data(px,i,'times')
    generate_train_or_test_data(py,i,'times')
    generate_test_predict(i,'times')
    generate_train_or_test_data(px,i,'deeps')
    generate_train_or_test_data(py,i,'deeps')
    generate_test_predict(i,'deeps')
    generate_train_or_test_data(px,i,'covers')
    generate_train_or_test_data(py,i,'covers')
    generate_test_predict(i,'covers')
for i in range(15,292):
    print(i)
    generate_train_or_test_data(px,i,'times')
    generate_train_or_test_data(py,i,'times')
    generate_test_predict(i,'times')
    generate_train_or_test_data(px,i,'deeps')
    generate_train_or_test_data(py,i,'deeps')
    generate_test_predict(i,'deeps')
    generate_train_or_test_data(px,i,'covers')
    generate_train_or_test_data(py,i,'covers')
    generate_test_predict(i,'covers')
generate_train_or_test_data(px,291,'times')
generate_train_or_test_data(py,291,'times')
generate_test_predict(291,'times')
generate_train_or_test_data(px,291,'deeps')
generate_train_or_test_data(py,291,'deeps')
generate_test_predict(291,'deeps')
generate_train_or_test_data(px,291,'covers')
generate_train_or_test_data(py,291,'covers')
generate_test_predict(291,'covers')
#组团搞出来最后的文件

rdd1 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(1)+'.txt')
rdd1 = add_keys(rdd1)
for j in range(4,292):
    j = j+1
    if j==1:
        pass
    else:
        rdd2 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(j)+'.txt')
        rdd2 = add_keys(rdd2)
        rdd1 = my_join(rdd1,rdd2)
    
for j in range(4,292):
    j=j+1
    rdd3 = sc.textFile('/home/jason/spark/weibo_predict/predicts/deeps_time_data_'+str(j)+'.txt')
    rdd3 = add_keys(rdd3)
    rdd1 = my_join(rdd1,rdd3)
    
    
def add_head(x):
    str1 = 'testWeibo'
    str1 = str1+str(x)
    return str1

import re  
rdd1 = rdd1.map(lambda x: re.sub(r'D'," ",str(x)).split())
rdd1 = rdd1.sortBy(lambda x: int(x[0]))
rdd1 = rdd1.map(lambda x:x[0]).zip(rdd1.map(lambda x:x[1:]))
rdd1_key = rdd1.keys().map(lambda x:add_head(x))
rdd1 = rdd1_key.zip(rdd1.values())
rdd1 = rdd1.map(lambda x: re.sub(r'D'," ",str(x)).split())

import csv
path = '/home/jason/spark/weibo_predict/'
end_path = str(path) + 'end_of_end.csv'
end_f = open(end_path,'w')
writer = csv.writer(end_f)
for lists in rdd1.collect():
    writer.writerow(lists)
end_f.close()
a=','
s1 = ['scaleT'+str((i+1)*15) for i in range(4,292)]
s1 = a.join(s1)
s2 = ['depthT'+str((i+1)*15) for i in range(4,292)]
s2 = a.join(s2)
s3 = 'WeiboID (Time Unit: Minutes)'+a+s1+s2
#print(s3)
end_path_2 = '/home/jason/spark/weibo_predict/end_of_end.csv'
end_path_1 = '/home/jason/spark/weibo_predict/end_of_end_.csv'
rdd = sc.textFile(end_path_2)
rdd = rdd.map(lambda x:add_head(x))
end_ff = open(end_path_1,'w')
end_ff.write(s3)
end_ff.write('
')
for lists in rdd.collect():
    end_ff.write(lists)
    end_ff.write('
')
end_ff.close()

万事走心 精益求美


原文地址:https://www.cnblogs.com/kongchung/p/6013477.html