libin_job_01_reduce.py

robby_map.py.bak   
#!/bin/python
#encoding=utf-8
import sys
import os
import time

# Minimum number of tab-separated fields a record must have to be processed.
MAP_COUNT = 23


def map_process(istream, ostream):
    """Emit one 'query' record per (input record, matching rule) pair.

    The rule file is named by sys.argv[2].  Every non-blank rule line
    supplies a pack id and a user id as its first two whitespace-separated
    fields.  Each tab-separated record read from ``istream`` that has at
    least MAP_COUNT fields is looked up by its user id (field 1); for every
    rule carrying that user id, in rule-file order, the line

        query <TAB> packid <TAB> pid <TAB> userid <TAB> channel <TAB> price

    is written to ``ostream`` (pid, channel and price are fields 16, 19
    and 9 of the record).  Shorter records are counted as error lines and
    skipped.  A one-line summary is written to stderr at the end.

    Returns:
        1 when sys.argv does not hold exactly two arguments (a time stamp,
        which is not used here, and the rule file path); otherwise None.

    Raises:
        ValueError: a non-blank rule line has fewer than two fields.
        IOError/OSError: the rule file cannot be opened or read.
    """
    if len(sys.argv) != 3:
        return 1
    file_name = sys.argv[2]

    # userid -> pack ids in rule-file order, so each record needs one dict
    # lookup instead of a scan over every rule.
    rules_by_user = {}
    with open(file_name, 'r') as file_rule:
        for line_no, line in enumerate(file_rule, 1):
            # split() accepts tab- or space-separated rules and drops the
            # trailing newline, which would otherwise stick to the last field.
            items = line.split()
            if not items:
                continue
            if len(items) < 2:
                raise ValueError("malformed rule line %d in %s: expected "
                                 "at least 2 fields" % (line_no, file_name))
            rules_by_user.setdefault(items[1], []).append(items[0])

    err_line_cnt = 0
    line_cnt = 0
    for line in istream:
        line_cnt += 1
        # NOTE(review): the published listing shows plain spaces where the
        # delimiters are; tab/newline are restored here because the reducer
        # splits its input on '\t' and strips '\n' -- confirm.
        record = line.rstrip('\n').split('\t')
        if len(record) < MAP_COUNT:
            err_line_cnt += 1
            continue
        userid = record[1]
        price = record[9]
        pid = record[16]
        channel = record[19]
        for rule_packid in rules_by_user.get(userid, ()):
            ostream.write('query\t%s\t%s\t%s\t%s\t%s\n'
                          % (rule_packid, pid, userid, channel, price))

    sys.stderr.write(
        "%s %s: totally %u lines processed with %u error lines\n"
        % (time.asctime(time.localtime(time.time())),
           os.path.basename(__file__), line_cnt, err_line_cnt))
                
if __name__ == '__main__':
    # Script entry point: run the mapper over stdin/stdout.
    try:
        status = map_process(sys.stdin, sys.stdout)
    except Exception as e:
        sys.stderr.write(
            "%s %s: failed to process file, [err_msg:%s]\n"
            % (time.asctime(time.localtime(time.time())),
               os.path.basename(__file__), e))
        sys.exit(-1)
    # map_process returns 1 on a bad command line; pass that on as the exit
    # status instead of always reporting success.
    sys.exit(status or 0)

[work@yx-testing-ecom124.vm.baidu.com hadoop_prac]$ cat robby_reduce.py.bak
#!/bin/python
#encoding=utf-8
#/***************************************************************************
# *
# * Copyright (c) 2013 Baidu.com, Inc. All Rights Reserved
# *
# **************************************************************************/
#
#/**
#  * @file    libin_job_01_reduce.py
#  * @author  chenlibin(chenlibin@baidu.com)
#  * @date    2013-07-21
#**/

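# input      rule,userid,'word',wordid,pid,shw,clk,price
# output     rule,userid,pid,shw,clk,price
# input ostream.write('query\t%s\t%s\t%s\t%s\t%s\t%s\n'%(rule,userid,pid,shw,clk,price))
# output rule,userid,pid,shw,clk,price
# NOTE(review): the four lines above look stale. The MAP_* indices below read
# each tab-separated input record as: type, packid, pid, userid, channel, price.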
import sys
import os
import time

# Positions of the fields in a tab-separated input record.
(MAP_TYPE, MAP_PACKID, MAP_PID,
 MAP_USERID, MAP_CHANNEL, MAP_PRICE) = range(6)
# MAP_TIME would be position 6; it is currently disabled.

# Records with fewer fields than this are skipped by reduce_process.
MAPCOUNT = 6

# Slot positions inside the cacheA / cacheB lists.
PACKID, PID, USERID, CHANNEL, PRICE = range(5)
# TIME would be slot 5; it is currently disabled.

# channel -> price total for the group currently being reduced; filled by
# load_cacheB / reduce_process and read by output_cacheB.
dic = {}
def output_cacheA(cache, ostream):
    """Write the '#A' summary line for ``cache`` to ``ostream``.

    The line holds pack id, user id, pid and the integer price total,
    tab-separated, followed by the marker ``#A`` and a newline.

    Returns:
        True if a line was written; False when the cache has not been
        loaded yet (its PACKID slot is still None).
    """
    if cache[PACKID] is None:
        return False
    # Escapes are spelled out: a raw line break inside the literal is a
    # syntax error.
    ostream.write("%s\t%s\t%s\t%d\t#A\n"
                  % (cache[PACKID], cache[USERID], cache[PID], cache[PRICE]))
    return True
def output_cacheB(cache, ostream):
    """Write the '#B' per-channel line for ``cache`` to ``ostream``.

    The line holds pack id, user id and pid from ``cache`` followed by the
    totals stored in the module-level ``dic`` for channels 201, 204, 225
    and 227, tab-separated, then the marker ``#B`` and a newline.

    Returns:
        True if a line was written; False when the cache has not been
        loaded yet (its PACKID slot is still None).

    Raises:
        KeyError: ``dic`` lacks one of the four channel keys.
    """
    if cache[PACKID] is None:
        return False
    # Escapes are spelled out: a raw line break inside the literal is a
    # syntax error.
    ostream.write("%s\t%s\t%s\t%d\t%d\t%d\t%d\t#B\n"
                  % (cache[PACKID], cache[USERID], cache[PID],
                     dic[201], dic[204], dic[225], dic[227]))
    return True
def load_cacheA(record, cacheA):
    """Overwrite ``cacheA`` with the pack id, user id, pid and price of ``record``.

    All fields are read and the price converted before ``cacheA`` is
    touched, so a bad record leaves the cache exactly as it was rather
    than pairing the new keys with the previous price total.

    Raises:
        ValueError: the price field is not an integer.
        IndexError: ``record`` is too short.
    """
    packid = record[MAP_PACKID]
    userid = record[MAP_USERID]
    pid = record[MAP_PID]
    price = int(record[MAP_PRICE])

    cacheA[PACKID] = packid
    cacheA[USERID] = userid
    cacheA[PID] = pid
    cacheA[PRICE] = price
def load_cacheB(record, cacheB):
    """Overwrite ``cacheB`` with ``record`` and set its channel total in ``dic``.

    Stores pack id, pid, user id, integer price and integer channel in
    ``cacheB``, then sets ``dic[channel]`` to the price (replacing any
    value already stored for that channel).

    All fields are read and converted before any state is touched, so a
    bad record changes neither ``cacheB`` nor ``dic``.

    Raises:
        ValueError: the price or channel field is not an integer.
        IndexError: ``record`` is too short.
    """
    packid = record[MAP_PACKID]
    pid = record[MAP_PID]
    userid = record[MAP_USERID]
    price = int(record[MAP_PRICE])
    channel = int(record[MAP_CHANNEL])

    cacheB[PACKID] = packid
    cacheB[PID] = pid
    cacheB[USERID] = userid
    cacheB[PRICE] = price
    cacheB[CHANNEL] = channel
    dic[channel] = price
def reduce_process(istream, ostream):
    """Aggregate prices over runs of records sharing user id and pid.

    Each line of ``istream`` is a tab-separated record laid out according
    to the MAP_* indices.  Consecutive records with the same user id and
    pid form one group.  When the group changes, and once more at end of
    input, two lines are written to ``ostream`` for the finished group:
    an ``#A`` line with the total price (output_cacheA) and a ``#B`` line
    with the totals for channels 201, 204, 225 and 227 (output_cacheB).

    Lines with fewer than MAPCOUNT fields are counted as error lines and
    reported on stderr.  Any other per-line failure (for example a
    non-numeric price or channel) is logged to stderr and the line is
    skipped without changing the running totals.  A summary line is
    written to stderr at the end.

    NOTE(review): the group key does not include the pack id, so adjacent
    records that differ only in pack id are merged under the first one --
    confirm this is intended.
    """
    cacheA = [None, None, None, 0, 0]
    cacheB = [None, None, None, 0, 0, 0]
    line_cnt = 0
    err_line_cnt = 0

    for line in istream:
        line_cnt += 1
        try:
            record = line.rstrip('\n').split('\t')
            if len(record) < MAPCOUNT:
                err_line_cnt += 1
                # Diagnostics go to stderr; stdout carries the reducer's
                # output records and must not be mixed with messages.
                sys.stderr.write("error line:%s\n" % (line))
                continue

            # Convert the numeric fields before flushing or mutating
            # anything, so a bad line cannot leave the caches half-updated
            # or trigger a flush of inconsistent totals.
            price = int(record[MAP_PRICE])
            channel = int(record[MAP_CHANNEL])

            same_group = (record[MAP_USERID] == cacheB[USERID]
                          and record[MAP_PID] == cacheB[PID])
            if not same_group:
                # New (user, pid) group: emit the finished one, start over.
                output_cacheA(cacheA, ostream)
                output_cacheB(cacheB, ostream)
                load_cacheA(record, cacheA)
                dic.clear()
                # output_cacheB reads exactly these channels.
                for tracked_channel in (201, 204, 225, 227):
                    dic[tracked_channel] = 0
                load_cacheB(record, cacheB)
            elif channel != cacheB[CHANNEL]:
                # Same group, different channel than the previous record.
                load_cacheB(record, cacheB)
                cacheA[PRICE] += price
            else:
                # Same group and channel: keep accumulating.
                cacheA[PRICE] += price
                cacheB[PRICE] = price
                dic[cacheB[CHANNEL]] += price
        except Exception as e:
            sys.stderr.write(
                "%s\t%s: failed in line#%d, [err_msg:%s]\n"
                % (time.asctime(time.localtime(time.time())),
                   os.path.basename(__file__), line_cnt, e))

    # Emit the last group.
    output_cacheA(cacheA, ostream)
    output_cacheB(cacheB, ostream)
    sys.stderr.write(
        "%s\t%s: totally %u lines processed with %u error lines\n"
        % (time.asctime(time.localtime(time.time())),
           os.path.basename(__file__), line_cnt, err_line_cnt))


if __name__ == '__main__':
    # Script entry point: run the reducer over stdin/stdout.
    try:
        reduce_process(sys.stdin, sys.stdout)
    except Exception as e:
        sys.stderr.write(
            "%s\t%s: failed to process file, [err_msg:%s]\n"
            % (time.asctime(time.localtime(time.time())),
               os.path.basename(__file__), e))
        sys.exit(-1)

    sys.exit(0)

原文地址:https://www.cnblogs.com/robbychan/p/3786650.html