# 字段分隔,每个字段一列

#将原文件存入字典
import os
import copy
import codecs 
# Work relative to the data directory so the bare file names used by this
# script resolve.
os.chdir('/Users/zhangb/Desktop/数据挖掘文件/取数流程')

# Load the source file into a dict. For each line, the first nine
# '|'-separated fields (re-joined with '|') form the key, and the last field,
# split on ',', is the value (the tag ids attached to that record).
source_dic = {}
# The context manager closes the file even if parsing raises part-way through.
with codecs.open('yalu1115', 'r', 'utf-8') as f_in:
    for raw_line in f_in:
        stripped = raw_line.strip()
        if not stripped:
            # A blank line would otherwise register an empty key and show up
            # as a bogus all-empty row in the output file.
            continue
        fields = stripped.split('|')
        source_dic['|'.join(fields[0:9])] = fields[-1].split(',')
# Load the dimension table into a dict keyed by the first '|'-separated field
# (the tag id), with [second field, third field] as the value,
# e.g. {'011100': ['年龄段', '0-17岁']}.
dim_dic = {}
# The context manager closes the file even if a malformed line raises.
with codecs.open('dim_tags.txt', 'r', 'utf-8') as f_in:
    for raw_line in f_in:
        stripped = raw_line.strip()
        if not stripped:
            # Skip blank lines (e.g. an empty line at the end of the file);
            # indexing their fields would raise IndexError.
            continue
        fields = stripped.split('|')
        dim_dic[fields[0]] = [fields[1], fields[2]]
# Index used to build the middle layer: the tag ids of the dimension table,
# in the dict's iteration order.
ind = list(dim_dic)

# Build the middle layer: for every record, one slot per dimension tag id,
# holding the tag name when the record carries that id and '' otherwise.
# Ids in a record that are absent from the dimension table are dropped.
middle_dic = {}
for k, v in source_dic.items():
    # Build the set once per record so each membership test is O(1) instead
    # of a scan over the record's id list.
    tag_ids = set(v)
    middle_dic[k] = [dim_dic[i][1] if i in tag_ids else '' for i in ind]
# Arrange the middle layer by the header file so that every record has a slot
# for every tag name, in header order, with '' where the record lacks the tag.
# Columns therefore line up across records: a given position holds either
# that column's tag name or the empty string.
# NOTE: matching is done by tag name, so two tags that share the same name
# cannot be told apart at this stage.
with codecs.open('dim_tags_name.txt', 'r', 'utf-8') as ff:
    sorted_list = [name.strip() for name in ff]

sort_dic = {}
for k, v in middle_dic.items():
    # Build the set once per record so each membership test is O(1).
    present = set(v)
    sort_dic[k] = [name if name in present else '' for name in sorted_list]
# A 0/1 matrix suited to computation could be produced instead by emitting 1
# in place of the tag name in the aligned rows.
# The rows are aligned now, so what remains is to collapse each of the first
# eight variables (which together span the leading columns) into one column.

# (start, stop) column range of each collapsed variable, in output order.
_GROUP_RANGES = [
    (0, 6),    # age
    (6, 8),    # gender (precise)
    (8, 10),   # gender
    (10, 18),  # has children
    (18, 21),  # consumption level
    (21, 24),  # marital status
    (24, 32),  # occupation status
    (32, 35),  # sexual orientation
]
# First column that is carried over unchanged (end of the last group).
_COLLAPSED_WIDTH = _GROUP_RANGES[-1][1]


def _last_non_empty(row, start, stop):
    """Return the last non-empty value among row[start] .. row[stop - 1].

    Returns '' when every value in the range is empty. Raises IndexError if
    the row is shorter than ``stop``.
    """
    picked = ''
    for idx in range(start, stop):
        # Index rather than slice, so a row that is too short fails loudly
        # instead of being silently truncated.
        if len(row[idx]) > 0:
            picked = row[idx]
    return picked


for k, v in sort_dic.items():
    collapsed = [_last_non_empty(v, start, stop) for start, stop in _GROUP_RANGES]
    # The remaining columns follow the eight collapsed ones unchanged.
    # Rebinding an existing key does not resize the dict, so this is safe
    # while iterating over it.
    sort_dic[k] = collapsed + v[_COLLAPSED_WIDTH:]
# Write one line per record: the key followed by every column, all separated
# by '|'. The context manager flushes and closes the file even if a write
# fails.
with codecs.open('done_yalu1115', 'w', 'utf-8') as ftags:
    for k, v in sort_dic.items():
        # The line terminator must be the '\n' escape; a string literal that
        # spans two physical lines is a syntax error.
        ftags.write(k + '|' + '|'.join(v) + '\n')
# 原文地址:https://www.cnblogs.com/zhangbojiangfeng/p/6077528.html