python cookbook 数据结构

保留最后n个元素：

from collections import deque

def search(lines, pattern, history=5):
    """Yield every line containing ``pattern`` together with its context.

    Args:
        lines: Iterable of text lines to scan.
        pattern: Substring looked up in each line with the ``in`` operator.
        history: Maximum number of preceding lines to remember.

    Yields:
        ``(line, previous_lines)`` tuples. ``previous_lines`` is the very same
        bounded deque on every yield and keeps changing while the scan goes
        on, so copy it if it has to outlive the current iteration.
    """
    recent = deque(maxlen=history)
    for current in lines:
        if pattern in current:
            yield current, recent
        # Remember the line only after it was (possibly) reported, so a
        # match never shows up inside its own context.
        recent.append(current)
# Scan chap1.txt: for each line mentioning 'python', print up to five lines
# of preceding context, the matching line itself, and then a separator.
with open(r'chap1.txt') as f:
    for line, prevlines in search(f, 'python', 5):
        # Lines read from the file keep their own newline, hence end=''.
        print(*prevlines, sep='', end='')
        print(line, end='')
        print('-' * 20)

chap1.txt

python1
python2
python3
python4
python5
python6
python7python
sfsfsfsdf python sdfsfs

q = deque(maxlen = 3)构造一个固定大小的队列，当队列满了最新的取代最先进入的。不定义maxlen会构造一个无限的队列。

　>>> q = deque()
　>>> q . append(1)
　>>> q . append(2)
　>>> q . append(3)

>>> q.appendleft(4)
>>> q
deque([4, 1, 2, 3])
>>> q . pop()
3
>>> q
deque([4, 1, 2])
>>> q . popleft()
4

查找最大或最小的N个元素

import heapq
# Sample holdings used to demonstrate heapq.nsmallest / heapq.nlargest
# with a key function.
portfolio = [
    {'name': 'IBM', 'shares': 100, 'price': 91.1},
    {'name': 'AAPL', 'shares': 50, 'price': 543.22},
    {'name': 'FB', 'shares': 200, 'price': 21.09},
    {'name': 'HPQ', 'shares': 35, 'price': 31.75},
    {'name': 'YHOO', 'shares': 45, 'price': 16.35},
    {'name': 'ACME', 'shares': 75, 'price': 115.65},
]

# Rank the holdings by their 'price' field: three cheapest, three dearest.
cheap = heapq.nsmallest(3, portfolio, key=lambda stock: stock['price'])
expensive = heapq.nlargest(3, portfolio, key=lambda stock: stock['price'])
print(cheap, expensive, sep='\n')

# Plain numbers need no key function: print the three largest values and
# then the three smallest ones.
nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))

如果只要最大最小，min(), max()方法最合适，如果n大小和list大小差不多，sorted(items)[:N] 或者是 sorted(items)[-N:]先排序后切片最合适。
heapq永远把最小的元素放在第一个元素

>>> nums = [1, 8, 2, 23, 7, - 4, 18, 23, 42, 37, 2]
>>> import heapq
>>> heapq.heapify(nums)
>>> nums
[-4, 2, 1, 23, 7, 2, 18, 23, 42, 37, 8]

获取最小3个元素

>>> heapq . heappop(nums)
-4
>>> heapq . heappop(nums)
1
>>> heapq . heappop(nums)
2

实现一个有优先级的队列

class PriorityQueue(object):
    """Queue whose ``pop`` returns the item pushed with the highest priority.

    Items pushed with equal priority come back in the order they were pushed.
    """

    def __init__(self):
        self._queue = []  # heap of (-priority, index, item) entries
        self._index = 0   # running push counter, used as a tie-breaker

    def push(self, item, priority):
        """Add ``item`` to the queue with the given ``priority``."""
        # heapq keeps the smallest entry first, so the priority is negated
        # to make the largest priority surface first. The index guarantees
        # that two entries never have to compare their items.
        entry = (-priority, self._index, item)
        heapq.heappush(self._queue, entry)
        self._index += 1

    def pop(self):
        """Remove and return the item with the highest priority.

        Raises:
            IndexError: If the queue is empty (propagated from
                ``heapq.heappop``).
        """
        _, _, item = heapq.heappop(self._queue)
        return item
class Item(object):
    """Minimal named object used as a payload in the priority-queue demo."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # Shows the name via its own repr, e.g. Item('foo').
        template = 'Item({!r})'
        return template.format(self.name)
# Push four items, two of which share priority 1, then drain the queue.
q = PriorityQueue()
q.push(item=Item('foo'), priority=1)
q.push(item=Item('bar'), priority=5)
q.push(item=Item('spam'), priority=4)
q.push(item=Item('grok'), priority=1)

# Highest priority comes out first; 'foo' precedes 'grok' because it was
# pushed earlier with the same priority.
print(q.pop())
print(q.pop())
print(q.pop())
print(q.pop())

(-priority, self._index, item)：将这个元组push进队列。-priority保证优先级数值最大的元素永远排在第一个位置；index保证优先级相同时按照插入顺序排序。如果不加这个index，优先级相同时会继续比较item本身，而Item实例不支持比较操作，因此会报错。
heapq.heappop(self._queue)[-1] -1取的是元组中的最后一个也就是item。

一个字典映射多个值

  from collections import defaultdict
  # With a list factory, a missing key starts out as an empty list.
  d = defaultdict(list)
  d['a'].extend([1, 2])
  d['b'].append(4)
  print(d['a'])
  # With a set factory, a missing key starts out as an empty set.
  d = defaultdict(set)
  d['a'].update((1, 2))
  d['b'].add(4)

这个是用来整洁下面这样的代码：

# Grouping by hand with a plain dict: every key needs an explicit
# "first time seen" branch. `pairs` is not defined in this snippet; the
# loop unpacks each of its elements as (key, value).
d = {}
for key, value in pairs:
    if key in d:
        d[key].append(value)
    else:
        d[key] = [value]

# The same grouping with defaultdict: a missing key is created as an empty
# list on first access, so no membership test is needed. `pairs` is not
# defined in this snippet; the loop unpacks each element as (key, value).
d = defaultdict(list)
for key, value in pairs:
    d[key].append(value)

字典排序：

from collections import OrderedDict
import json
def ordered_dict():
    """Print an OrderedDict's entries in insertion order, then as JSON."""
    d = OrderedDict([('foo', 1), ('bar', 2), ('spam', 3), ('grok', 4)])
    # Outputs "foo 1", "bar 2", "spam 3", "grok 4"
    for key, value in d.items():
        print(key, value)
    # The JSON text lists the keys in the same order.
    print(json.dumps(d))


ordered_dict()

OrderedDict按照插入顺序保存元素，在需要精确控制JSON编码后字段顺序时非常有用。但是OrderedDict的大小是普通字典的两倍，因为它内部还额外维护着一个链表，使用时需要权衡。

普通字典的运算(最大值，最小值，排序)

def dict_sort_test():
    """Print the min, max and sorted view of a price table.

    The first half works on ``(price, name)`` pairs built with ``zip``;
    the second half asks ``min``/``max`` for the key directly.
    """
    prices = {
        'ACME': 45.23,
        'AAPL': 612.78,
        'IBM': 205.55,
        'HPQ': 37.20,
        'FB': 10.75
    }

    def price_name_pairs():
        # zip() hands back a one-shot iterator, so build a new one per use.
        return zip(prices.values(), prices.keys())

    print(min(price_name_pairs()))
    print(max(price_name_pairs()))
    print(sorted(price_name_pairs()))

    prices_and_names = price_name_pairs()
    print(min(prices_and_names))
    # Calling max(prices_and_names) now would raise ValueError: the
    # iterator was already exhausted by min() above.

    cheapest = min(prices, key=prices.get)   # 'FB'
    priciest = max(prices, key=prices.get)   # 'AAPL'
    print(cheapest)
    print(priciest)
    print(prices[cheapest])


dict_sort_test()

zip函数创建的是只能访问一次的迭代器。连续访问会报错：ValueError: max() arg is an empty sequence

查找两个字典的相同点

def dict_diff():
    """Print what two dicts have in common, using set operations on views."""
    a = dict(x=1, y=2, z=3)
    b = dict(w=10, x=11, y=2)

    shared_keys = a.keys() & b.keys()
    print(shared_keys)                 # keys present in both
    print(type(shared_keys))           # <class 'set'>
    print(a.keys() - b.keys())         # keys only in a
    print(a.items() & b.items())       # identical (key, value) pairs

    # Copy of `a` without the listed keys; the result is not used further.
    excluded = {'z', 'w'}
    c = {key: a[key] for key in a.keys() - excluded}   # {'x': 1, 'y': 2}


dict_diff()

可以根据已有字典生成一个不包含某些键的字典

消除序列中相同元素并保持顺序

def dedupe(items, key=None):
    """Yield the items of ``items`` without duplicates, keeping their order.

    Args:
        items: Iterable to filter.
        key: Optional callable turning an item into the hashable value that
            decides whether it was seen before. Without it the item itself
            is used and therefore has to be hashable.

    Yields:
        The first occurrence of every distinct item.
    """
    seen = set()
    for item in items:
        marker = key(item) if key is not None else item
        if marker in seen:
            continue
        yield item
        seen.add(marker)
# Hashable items can be de-duplicated as they are.
a = [1, 5, 2, 1, 9, 1, 5, 10]
print(list(dedupe(a)))

# dicts are not hashable, so a key function supplies the value to compare:
# first the (x, y) pair, then x alone.
a = [
    {'x': 1, 'y': 2},
    {'x': 1, 'y': 3},
    {'x': 1, 'y': 2},
    {'x': 2, 'y': 4},
]
print(list(dedupe(a, key=lambda row: (row['x'], row['y']))))
print(list(dedupe(a, key=lambda row: row['x'])))

这个是保持了原顺序，如果不需要保持顺序只是简单的去重用set就可以了。

切片

def slice_test():
    """Show that a slice object is interchangeable with literal slicing."""
    items = list(range(7))
    a = slice(2, 4)
    print(items[2:4])
    print(items[a])

    s = 'HelloWorld'
    a = slice(5, 50, 2)
    # indices() clamps the slice to the sequence length: (5, 10, 2) here.
    start, stop, step = a.indices(len(s))
    for position in range(start, stop, step):
        print(s[position])


slice_test()

值得注意的就是a定义了end为50，但实际字符串长度是10，用slice避免索引越界，会自动调整。

对单词的计数

from collections import Counter
def counter_test():
    """Count words with Counter, extend the tally and combine two tallies."""
    words = [
        'look', 'into', 'my', 'eyes', 'look', 'into', 'my', 'eyes',
        'the', 'eyes', 'the', 'eyes', 'the', 'eyes', 'not', 'around',
        'the', 'eyes', "don't", 'look', 'around', 'the', 'eyes', 'look',
        'into', 'my', 'eyes', "you're", 'under',
    ]
    word_counts = Counter(words)
    print(word_counts['neverthere'])       # a word never seen counts as 0
    print(word_counts.most_common(3))

    # Two interchangeable ways to count additional words: call update(), as
    # done below, or loop manually with
    #     for word in morewords: word_counts[word] += 1
    morewords = ['why', 'are', 'you', 'not', 'looking', 'in', 'my', 'eyes']

    print(word_counts['eyes'])
    word_counts.update(morewords)
    print(word_counts['eyes'])

    # Counters support arithmetic: add or subtract the per-word counts.
    a, b = Counter(words), Counter(morewords)
    print(a + b)
    print(a - b)


counter_test()

字典按关键字排序

from operator import itemgetter
def dict_sort_keyword_test():
    """Print a list of dicts sorted by several different fields."""
    rows = [
        {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003},
        {'fname': 'David', 'lname': 'Beazley', 'uid': 1002},
        {'fname': 'John', 'lname': 'Cleese', 'uid': 1001},
        {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}
    ]
    # Each entry is (sort key, descending?). The original note says
    # itemgetter is a bit faster than the equivalent lambda r: r['fname'].
    orderings = (
        (itemgetter('fname'), True),             # first name, reversed
        (itemgetter('uid'), False),              # user id
        (itemgetter('lname', 'fname'), False),   # last name, then first name
    )
    for sort_key, descending in orderings:
        print(sorted(rows, key=sort_key, reverse=descending))

    # The same key function works with min(): smallest uid in the list.
    lowest = min(rows, key=itemgetter('uid'))
    print(lowest['uid'])

对象的按照关键字比较

from operator import attrgetter
class User:
    """Simple record holding a single ``user_id`` attribute."""

    def __init__(self, user_id):
        self.user_id = user_id

    def __repr__(self):
        # Rendered as User(<user_id>), e.g. User(23).
        template = 'User({})'
        return template.format(self.user_id)
def sort_notcompare():
    """Sort User objects, which define no ordering, through a key function."""
    users = [User(uid) for uid in (23, 3, 99)]
    print(users)
    # attrgetter and the lambda extract the same attribute, so both sorted
    # lists come out the same.
    for key_func in (attrgetter('user_id'), lambda u: u.user_id):
        print(sorted(users, key=key_func))


sort_notcompare()

和上面类似attrgetter更快一些。同时支持多个字段的比较

# Illustrative only: `users` is not defined at module level here, and its
# elements are assumed to expose last_name, first_name and user_id
# attributes.
# attrgetter accepts several names and then compares by the resulting tuple.
by_name = sorted(users, key=attrgetter('last_name', 'first_name'))
# The same kind of key function works with min() and max() as well.
min(users, key=attrgetter('user_id'))

通过某个字段将序列分组

# Sample records shared by the grouping examples below; 'date' is a
# MM/DD/YYYY string.
rows = [
    {'address': '5412 N CLARK', 'date': '07/01/2012', 'pid': 7},
    {'address': '5148 N CLARK', 'date': '07/04/2012', 'pid': 2},
    {'address': '5800 E 58TH', 'date': '07/02/2012', 'pid': 5},
    {'address': '2122 N CLARK', 'date': '07/03/2012', 'pid': 6},
    {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012', 'pid': 8},
    {'address': '1060 W ADDISON', 'date': '07/02/2012', 'pid': 7},
    {'address': '4801 N BROADWAY', 'date': '07/01/2012', 'pid': 1},
    {'address': '1039 W GRANVILLE', 'date': '07/04/2012', 'pid': 1},
]

按照date排序并分组

from operator import itemgetter
from itertools import groupby

def group_by_keyword_sort():
    """Sort the module-level ``rows`` in place and print them grouped by date.

    groupby() only merges neighbouring records, which is why the list is
    sorted by date (and by pid within a date) before grouping.
    """
    by_date = itemgetter('date')
    rows.sort(key=itemgetter('date', 'pid'))
    # Walk the groups: one header line per date, then its records.
    for date, group in groupby(rows, key=by_date):
        print(date)
        for record in group:
            print(' ', record)

groupby()函数扫描整个序列并且查找连续相同值(或者根据指定key函数返回值相同)的元素序列。在每次迭代的时候,它会返回一个值和一个迭代器对象, 这个迭代器对象可以生成元素值全部等于上面那个值的组中所有对象。所以在分组前一定要先排好序。可以实现多字段排序。

这是按照date排序了，如果不想排序，只是想放进一个大的dict里，根据key来获取最好用defaultdict

def group_by_keyword_no_sort():
    """Print the module-level ``rows`` grouped by date without sorting them.

    The records are collected into a dict of lists keyed by their date, so
    dates appear in the order in which they are first encountered.
    """
    from collections import defaultdict

    rows_by_date = defaultdict(list)
    for row in rows:
        rows_by_date[row['date']].append(row)

    for date, members in rows_by_date.items():
        print(date)
        for member in members:
            print(' ', member)

从字典提取子字典

def sub_dict():
  """Build four sub-dictionaries of ``prices`` with different techniques.

  The results are only bound to local names; nothing is printed or returned.
  """
  prices = {
    'ACME': 45.23,
    'AAPL': 612.78,
    'IBM': 205.55,
    'HPQ': 37.20,
    'FB': 10.75
  }
  # p1 / p2: keep the entries whose price is above 200, written once as a
  # dict comprehension and once as dict() over a generator of pairs.
  p1 = {key: value for key, value in prices.items() if value > 200}
  p2 = dict((key, value) for key, value in prices.items() if value > 200)
  names = {'AAPL', 'IBM', 'HPQ'}
  # p3 / p4: keep the entries whose key is in `names`, once by filtering
  # the items and once by intersecting the key view with the set.
  p3 = {key: value for key, value in prices.items() if key in names}
  p4 = {key: prices[key] for key in prices.keys() & names}

使用哪种方法根据自己习惯，p1比p2快些，p3比p4快些。

映射名称到序列元素

def nametuple_test():
    """Create a namedtuple record and print its fields by attribute name."""
    from collections import namedtuple

    Subscriber = namedtuple('Subscriber', ['addr', 'joined'])
    sub = Subscriber(addr='fzk@example.com', joined='2017-03-21')
    # Fields are read by name instead of by position.
    print(sub.addr, sub.joined, sep='\n')

命名元组的一个主要用途是让代码摆脱对下标的依赖，例如处理数据库查询返回的记录。

from collections import namedtuple
# Record layout expected by compute_cost: (name, shares, price).
Stock = namedtuple('Stock', ['name', 'shares', 'price'])


def compute_cost(records):
    """Return the total value (shares * price) of all records.

    Args:
        records: Iterable of 3-item sequences laid out as
            ``(name, shares, price)``.

    Returns:
        The sum of ``shares * price`` over every record as a float;
        ``0.0`` when ``records`` is empty.
    """
    total = 0.0
    for rec in records:
        s = Stock(*rec)
        # Accumulate inside the loop so that every record contributes,
        # not just the last one.
        total += s.shares * s.price
    return total

命名元组的另外一个用途是作为字典的替代。但是命名元组是不可修改的，如果需要修改，要用_replace()创建一个新元组；如果经常需要修改，那么命名元组并不合适。

# _replace() builds a new tuple with `joined` changed, and `sub` is rebound
# to that new tuple.
# NOTE(review): `sub` is not defined at module level in this file; this line
# presumably continues the Subscriber example from nametuple_test - confirm.
sub = sub._replace(joined='2017-03-22')

可以创建一个包含缺省值的原型元组，然后使用_replace()方法创建字段值被更新过的新实例。

# Five-field record plus a prototype instance that carries the defaults.
Stock = namedtuple('Stock', ['name', 'shares', 'price', 'date', 'time'])
stock_prototype = Stock('', 0, 0.0, None, None)


def dict_to_stock(s):
    """Return a Stock built from mapping ``s``.

    Fields that ``s`` does not provide keep the value they have in
    ``stock_prototype``.
    """
    overrides = s
    return stock_prototype._replace(**overrides)


# 'date' and 'time' are missing from the dict and stay None.
a = {'name': 'ACME', 'shares': 100, 'price': 123.45}
print(dict_to_stock(a))

转换并同时计算数据

# Transform and reduce in one step: the generator expression feeds the
# squares to sum() without building an intermediate list.
nums = [1, 2, 3, 4, 5]
s = sum(value * value for value in nums)