# Python -- Advanced Techniques, Part 1

# ===============Python 进阶=======================

# ---------第一章：数据结构和算法-----------

# ---------- Unpacking a sequence into separate variables ----------
p = (4, 5)
x, y = p
print(f'x: {x} y: {y}')

data = ['qf', 50, 91.1, (2012, 12, 21)]
name, shares, price, date = data
print(f'name: {name} date: {date}')

# Unpacking raises an exception when the number of targets does not match the
# number of elements in the sequence.
# To keep only some of the values, bind the unwanted ones to a throwaway name
# (one that is not used anywhere else).
data = ['qf', 50, 91.1, (2012, 12, 21)]
# "_" is the throwaway name here.
_, shares, price, _ = data

# When the iterable holds more elements than there are targets, plain
# unpacking raises an exception; a starred target collects the surplus instead.
record = ('Dave', 'dave@example.com', '773-555-1212', '847-555-1212')
name, email, *phone_numbers = record
# The starred target (phone_numbers) is always bound to a list.

record = ('ACME', 50, 123.45, (12, 18, 2012))
# Starred throwaway targets skip the middle fields and the leading parts of
# the nested date tuple, keeping only the first field and the year.
name, *_, (*_, year) = record
print(f'name: {name} year: {year}')

# --------------- Queues ---------------
from collections import deque
# deque(maxlen=N) creates a fixed-size queue: when it is full and a new element
# is appended, the oldest element is discarded automatically.
# A queue is first in, first out.
q = deque(maxlen=3)
q.append('添加一个元素')
# pop() removes the most recently appended element (the right end);
# popleft() would remove the oldest element instead.
q.pop()

# -------------- Finding the N largest or smallest items of a collection --------------
# The heapq module provides two functions for this: nlargest() and nsmallest().
# They are a good fit when N is small relative to the size of the collection.
# For just the single largest or smallest item, max() and min() are faster.
# When N approaches the size of the collection, sorting with sorted() and then
# slicing is faster.
import heapq
nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print(heapq.nlargest(3, nums)) # Prints [42, 37, 23]
print(heapq.nsmallest(3, nums)) # Prints [-4, 1, 2]

portfolio = [
{'name': 'IBM', 'shares': 100, 'price': 91.1},
{'name': 'AAPL', 'shares': 50, 'price': 543.22},
{'name': 'FB', 'shares': 200, 'price': 21.09},
{'name': 'HPQ', 'shares': 35, 'price': 31.75},
{'name': 'YHOO', 'shares': 45, 'price': 16.35},
{'name': 'ACME', 'shares': 75, 'price': 115.65}
]
# Rank the records by their 'price' value via the key function.
cheap = heapq.nsmallest(3, portfolio, key=lambda s: s['price'])
expensive = heapq.nlargest(3, portfolio, key=lambda s: s['price'])
print(cheap)  # the YHOO, FB and HPQ records (cheapest first)
print(expensive)  # the AAPL, ACME and IBM records (most expensive first)

# ----------实现一个优先级队列-------------
import heapq
class PriorityQueue:
    """Priority queue that returns the highest-priority item first.

    Entries are kept on a heapq-managed list as
    ``(-priority, insertion_index, item)`` tuples. Negating the priority
    makes the min-heap hand back the largest priority first. The insertion
    index grows with every push, so items of equal priority come out in the
    order they were pushed and the items themselves are never compared.
    """

    def __init__(self):
        self._queue = []   # heap of (-priority, index, item) entries
        self._index = 0    # running counter used as a tie-breaker

    def push(self, item, priority):
        """Add *item* to the queue with the given numeric *priority*."""
        entry = (-priority, self._index, item)
        heapq.heappush(self._queue, entry)
        self._index += 1

    def pop(self):
        """Remove and return the item with the highest priority.

        Raises IndexError (from heapq.heappop) when the queue is empty.
        """
        _neg_priority, _order, item = heapq.heappop(self._queue)
        return item

# ----------- Mapping dictionary keys to multiple values -----------
from collections import defaultdict
# defaultdict(list) creates an empty list the first time a missing key is accessed.
d = defaultdict(list)
d['a'].append(1)
d['a'].append(2)
d['b'].append(3)
print(d['b'])  # prints [3]

# Same idea with a set as the per-key container.
d = defaultdict(set)
d['a'].add(1)
d['a'].add(2)
d['b'].add(4)
print(d['a'])  # prints {1, 2}

d = {} # A regular dictionary
# setdefault() returns the value already stored for the key, or stores and
# returns the given default when the key is missing.
d.setdefault('a', []).append(1)
d.setdefault('a', []).append(2)
d.setdefault('b', []).append(4)
print(d)  # prints {'a': [1, 2], 'b': [4]}

# -------------- Keeping dictionaries in order --------------
from collections import OrderedDict
# An OrderedDict preserves the order in which items were inserted when it is iterated.
# Note: an OrderedDict is about twice the size of a regular dictionary because
# it internally maintains an additional linked list.
d = OrderedDict()
d['a'] = 1
d['c'] = 3
d['b'] = 2
print(d)  # keys appear in insertion order: a, c, b

# -------------- Calculating with dictionaries --------------
prices = {
    'ACME': 45.23,
    'AAPL': 612.78,
    'IBM': 205.55,
    'HPQ': 37.20,
    'FB': 10.75
}
# zip() pairs each value with its key, so min(), max() and sorted() compare
# the (price, name) tuples by price first.
min_price = min(zip(prices.values(), prices.keys()))  # (10.75, 'FB')
max_price = max(zip(prices.values(), prices.keys()))  # (612.78, 'AAPL')
prices_sorted = sorted(zip(prices.values(), prices.keys()))
# Note: zip() creates an iterator that can only be consumed once.
# Alternative: a key function yields only the key of the extreme entry.
# The results of the two expressions below are not stored.
min(prices, key=lambda k: prices[k]) # Returns 'FB'
max(prices, key=lambda k: prices[k]) # Returns 'AAPL'

# ------------- Finding what two dictionaries have in common -------------
a = {
    'x' : 1,
    'y' : 2,
    'z' : 3
}
b = {
    'w' : 10,
    'x' : 11,
    'y' : 2
}
# The results of the three set expressions below are not stored; the trailing
# comments show what each one evaluates to.
# Find keys in common
a.keys() & b.keys() # { 'x', 'y' }
# Find keys in a that are not in b
a.keys() - b.keys() # { 'z' }
# Find (key,value) pairs in common
a.items() & b.items() # { ('y', 2) }
# These operations can also be used to alter or filter a dictionary.
# Make a new dictionary with certain keys removed
c = {key:a[key] for key in a.keys() - {'z', 'w'}}
# c is {'x': 1, 'y': 2}

# ---------- Removing duplicates from a sequence while preserving order ----------
def dedupe(items, key=None):
    """Yield the elements of *items* in their original order, skipping repeats.

    An element counts as a repeat when its marker has been seen before. The
    marker is the element itself when *key* is None, otherwise ``key(element)``.
    Markers are stored in a set, so they must be hashable; pass a *key* that
    returns a hashable value for unhashable elements such as dicts.
    """
    already_seen = set()
    for element in items:
        if key is None:
            marker = element
        else:
            marker = key(element)
        if marker in already_seen:
            continue
        yield element
        already_seen.add(marker)
# Dicts are unhashable, so dedupe() is given a key that maps each record to a
# hashable (x, y) tuple; the third record repeats the first and is dropped.
a = [ {'x':1, 'y':2}, {'x':1, 'y':3}, {'x':1, 'y':2}, {'x':2, 'y':4}]
a_list = list(dedupe(a, key=lambda d: (d['x'],d['y'])))
print(a_list)  # prints [{'x': 1, 'y': 2}, {'x': 1, 'y': 3}, {'x': 2, 'y': 4}]

# --------------------- Naming a slice -------------------------
# The built-in slice() function creates a slice object that can be used
# anywhere a slice is allowed.
items = [0, 1, 2, 3, 4, 5]
a = slice(2,4)
print(items[a])  # same as items[2:4]; prints [2, 3]

# --------------- Finding the most frequent items in a sequence -----------------
from collections import Counter

words = ['look', 'into', 'my', 'eyes', 'look', 'into', 'my', 'eyes',
'the', 'eyes', 'the', 'eyes', 'the', 'eyes', 'not', 'around', 'the',
'eyes', "don't", 'look', 'around', 'the', 'eyes', 'look', 'into',
'my', 'eyes', "you're", 'under']
# A Counter can be built from any sequence of hashable objects.
word_counts = Counter(words)
# The three most frequent words, as (word, count) pairs.
top_three = word_counts.most_common(3)
print(top_three)  # prints [('eyes', 8), ('the', 5), ('look', 4)]

# --------------- Sorting a list of dictionaries by a common key ------------------
# Uses the itemgetter function from the operator module.
from operator import itemgetter
rows = [
    {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
    {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
    {'fname': 'John', 'lname': 'Cleese', 'uid': 1001},
    {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}
]
# Sorted by first name.
rows_by_name = sorted(rows, key=itemgetter('fname'))
# NOTE: despite its name, this is sorted by last name first, then first name.
rows_by_fname_lname = sorted(rows, key=itemgetter('lname', 'fname'))
# Same ordering as the line above, with a lambda building the (lname, fname) key.
rows_by_lfname = sorted(rows, key=lambda r: (r['lname'],r['fname']))

# ---------------排序不支持原生比较的对象--------------------------
from operator import attrgetter
class User:
    """Minimal record holding a user id; it defines no comparison methods."""

    def __init__(self, user_id):
        self.user_id = user_id

    def __repr__(self):
        # Render as "User(<user_id>)" so printed lists of users are readable.
        return f'User({self.user_id})'

def sort_notcompare():
    """Print three User objects as created, then sorted by user_id twice.

    The two sorted listings are equivalent: the first builds the sort key
    with a lambda, the second with operator.attrgetter.
    """
    users = [User(uid) for uid in (23, 3, 99)]
    print(users)

    # Sort key supplied as a lambda.
    ordered_by_lambda = sorted(users, key=lambda user: user.user_id)
    print(ordered_by_lambda)

    # Same sort key supplied by attrgetter from the operator library.
    ordered_by_getter = sorted(users, key=attrgetter('user_id'))
    print(ordered_by_getter)


sort_notcompare()

# --------------- Grouping records by a field --------------------------
# Given a sequence of dictionaries or instances, iterate over the data in
# groups based on the value of a particular field.
# itertools.groupby() is well suited to this kind of grouping.
from operator import itemgetter
from itertools import groupby
# Sample records; 'date' is the field used for grouping.
rowss = [
    {'address': '5412 N CLARK', 'date': '07/01/2012'},
    {'address': '5148 N CLARK', 'date': '07/04/2012'},
    {'address': '5800 E 58TH', 'date': '07/02/2012'},
    {'address': '2122 N CLARK', 'date': '07/03/2012'},
    {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'},
    {'address': '1060 W ADDISON', 'date': '07/02/2012'},
    {'address': '4801 N BROADWAY', 'date': '07/01/2012'}
]
def groupby_rows():
    """Print the records in the module-level ``rowss`` list grouped by date.

    The list is sorted in place by its 'date' field first, because groupby()
    only groups consecutive records that share the same key. Each date is
    printed, followed by the records that belong to it.
    """
    by_date = itemgetter('date')
    rowss.sort(key=by_date)
    for date, group in groupby(rowss, key=by_date):
        print(date)
        for record in group:
            print(record)


groupby_rows()

# ------------------过滤序列元素-------------------
# filter()
# itertools库中compress()

# ------------------ Extracting a subset of a dictionary ----------------
# 1) Using a dictionary comprehension
prices = {
    'ACME': 45.23,
    'AAPL': 612.78,
    'IBM': 205.55,
    'HPQ': 37.20,
    'FB': 10.75
}
# Keep only the entries whose price is above 200.
p1 = {key: value for key, value in prices.items() if value > 200}
print(p1)  # prints {'AAPL': 612.78, 'IBM': 205.55}

# ------------------ Combining multiple dictionaries or mappings ----------------
# Given several dictionaries or mappings, combine them logically into a single
# mapping in order to perform operations on them.
# Uses the ChainMap class from the collections module.
from collections import ChainMap
a = {'x':1, 'z':3}
b = {'y':2, 'z':4}
# Lookups search the mappings in the order given, so b takes precedence over a.
c = ChainMap(b, a)
print(c['z'])  # prints 4 (found in b first)