netflix数据处理2(转)

原始数据（每个文件的第一行是 "movie_id:"，其后每行的格式为 user_id,rating,date）：
$head -10 mv_0006890.txt
6890:
1735266,1,2004-04-02
1008399,1,2004-06-22
2360117,2,2003-11-08
1294425,2,2004-03-15
439931,4,2004-03-27
1583311,1,2004-03-11
2431832,3,2005-02-13
620771,2,2004-03-20
1110906,1,2004-03-04

结果数据（每行的格式为 user_id movie_id rating，以空格分隔）：
$head -10 ratings_0.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5

import sys
import os
import re

# Switch read by main(): when true, main() rolls over to a new ratings_N.txt
# output file once its running character count passes the size threshold;
# when false, every rating is written to ratings_0.txt.
CHUNK_FILES = True

def mkdir(path):
    """Create directory ``path`` (and any missing parents) if needed.

    Does nothing when ``path`` is already a directory.

    Raises:
        OSError: if the directory cannot be created, including the case
            where ``path`` exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Attempting the creation and inspecting the failure avoids the race
        # between an exists() check and makedirs(). An already-existing
        # directory is the only failure that is safe to ignore.
        if not os.path.isdir(path):
            raise

def _ratings_path(out_dir, index):
    """Return the path of the ``index``-th ratings output file in ``out_dir``."""
    return os.path.join(out_dir, 'ratings_' + str(index) + '.txt')


def _reformat_movie_titles(src='movie_titles.txt',
                           dst='reformatted_movie_titles.txt'):
    """Rewrite ``id,year,title`` lines from ``src`` as ``id<TAB>title`` in ``dst``."""
    title_pattern = re.compile(r"([\w]+),([\w]+),(.*)")
    with open(dst, 'w') as titles_out:
        with open(src, 'r') as titles_in:
            for line in titles_in:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a stray newline at EOF).
                    continue
                m = title_pattern.match(line)
                if m is None:
                    raise ValueError('malformed movie title line: %r' % line)
                titles_out.write('%s\t%s\n' % (m.group(1), m.group(3)))


def main(args):
    """Reformat the movie title list and flatten the per-movie rating files.

    Two steps run in order:

    1. ``movie_titles.txt`` in the current working directory is rewritten to
       ``reformatted_movie_titles.txt`` as ``id<TAB>title`` lines.
    2. Every file in the input directory is read (first line ``movie_id:``,
       remaining lines ``user_id,rating,date``) and each rating is written
       as a ``user_id movie_id rating`` line to ``ratings_N.txt`` in the
       output directory.

    Input files are processed in sorted name order so the output is
    reproducible. When ``CHUNK_FILES`` is true, a new ``ratings_N.txt`` is
    started once the characters written to the current one pass roughly
    65.5 million.

    Args:
        args: argv-style list; ``args[1]`` is the input directory and
            ``args[2]`` is the output directory (created if missing).

    Raises:
        IndexError: if ``args`` has fewer than three items.
        ValueError: if a non-blank title or rating line is malformed.
    """
    _reformat_movie_titles()

    in_dir = args[1]   # input directory
    out_dir = args[2]  # output directory
    # os.listdir() returns names in arbitrary order; sort for stable output.
    filenames = [os.path.join(in_dir, name)
                 for name in sorted(os.listdir(in_dir))]
    chars_written = 0
    outfile_num = 0
    mkdir(out_dir)
    output_file = open(_ratings_path(out_dir, outfile_num), 'w')
    try:
        for i, moviefile in enumerate(filenames):
            print("processing movie %s " % (i + 1))
            with open(moviefile, 'r') as f:
                for j, line in enumerate(f):
                    if j == 0:
                        # Header line looks like "6890:".
                        movieid = line.strip().split(':')[0]
                        continue
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines instead of crashing
                    userid, rating, _date = line.split(',')
                    nextline = ' '.join([userid, movieid, rating]) + '\n'
                    chars_written += len(nextline)
                    # Floor division keeps the original Python 2 integer
                    # semantics: roll over at about 65.5 million characters.
                    if CHUNK_FILES and chars_written // 1000 > 65536:
                        output_file.close()
                        outfile_num += 1
                        outfilename = _ratings_path(out_dir, outfile_num)
                        print("--- starting new file: %s" % outfilename)
                        output_file = open(outfilename, 'w')
                        chars_written = len(nextline)
                    output_file.write(nextline)
    finally:
        # Always release the current output file, even on a parse error.
        output_file.close()


if __name__ == '__main__':
    # main() reads the input directory from argv[1] and the output directory
    # from argv[2]; exit with a usage message rather than letting a missing
    # argument surface as an IndexError traceback from inside main().
    if len(sys.argv) < 3:
        sys.exit('usage: %s <input_dir> <output_dir>' % sys.argv[0])
    main(sys.argv)

经过处理后，会得到多个用户评分数据文件（ratings_*.txt），用下面的脚本把它们合并到一个文件 result.txt：
#!/bin/bash
# Merge every chunked ratings file into a single result.txt.
# Start from an empty result.txt so that re-running the script does not
# append a second copy of the data.
: > result.txt
for x in netflix-data/ratings_*.txt ;
do
    # When the glob matches nothing the loop sees the literal pattern; skip it.
    [ -e "$x" ] || continue
    # Quote the name so paths containing spaces are passed to cat intact.
    cat "$x" >> result.txt ;
done &
# NOTE: the trailing '&' runs the loop in the background, so result.txt is
# complete only after that background job has finished.

$head -10 result.txt
499040 9419 3
2071637 9419 4
896780 9419 3
2625420 9419 2
652121 9419 3
1003291 9419 4
818736 9419 3
332152 9419 2
2174771 9419 4
47411 9419 5