缓存算法之LRU与LFU

1. LRU算法
1.1 背景
目前尽量由于摩尔定律，但是在存储硬件方面始终存在着差异，并且这种差异是不在同一数量级别的区别，例如在容量方面，内存<<外存；而在硬件成本与访问效率方面，内存>>外存。而目前互联网服务平台存在的特点：a. 读多写少，快速ms级响应，因此需要把数据搁在内存上；b. 数据规模巨大，长尾效应,由于数据规模巨大，只能把全量数据搁在外存上。正是由于服务场景需求与存储硬件特征之间的本身矛盾，缓存及相应的淘汰算法由此产生了：

一个在线服务平台其读取数据的过程：总是优先去离CPU最近的地方内存中读取数据，当有限的内存容量空间读取命中为空被击穿时，则会去外存的数据库或文件系统中读取；而当有限的缓存空间里“人满为患”时，而又有新的热点成员需要加入时，自然需要一定的淘汰机制。本能的基础淘汰算法：最后访问时间最久的成员最先会被淘汰(LRU)。

1.2 基本原理

由于队列具有先进先出的操作特点，所以通常用队列实现LRU，按最后访问的时间先进先出。

a. 利用队列类型对象，记录最近操作的元素,总是放在队首，这样最久未操作的元素自然被相对移动到队尾；同时，当元素总数达到上限值时，优先移除与淘汰队尾元素。
b. 利用HashMap辅助对象，快速检索队列中存在的元素。
1.3 操作
a. 写入操作：新建一个元素，把元素插入队列首，当元素总和达到上限值时，同时删除队尾元素。
b. 读取操作：利用map快速检索，队列相关联的元素。
1.4 实现源码
a. 自定义链表方式

  1 // A simple LRU cache written in C++
  2 // Hash map + doubly linked list
  3 #include <iostream>
  4 #include <vector>
  5 #include <ext/hash_map>
  6 using namespace std;
  7 using namespace __gnu_cxx;
  8 
  9 template <class K, class T>
 10 struct Node{
 11   K key;
 12   T data;
 13   Node *prev, *next;
 14 };
 15 
 16 template <class K, class T>
 17 class LRUCache{
 18  public:
 19   LRUCache(size_t size){
 20     entries_ = new Node<K,T>[size];
 21     for(int i=0; i<size; ++i)// 存储可用结点的地址
 22       free_entries_.push_back(entries_+i);
 23     head_ = new Node<K,T>;
 24     tail_ = new Node<K,T>;
 25     head_->prev = NULL;
 26     head_->next = tail_;
 27     tail_->prev = head_;
 28     tail_->next = NULL;
 29   }
 30   ~LRUCache(){
 31     delete head_;
 32     delete tail_;
 33     delete[] entries_;
 34   }
 35   void Put(K key, T data){
 36     Node<K,T> *node = hashmap_[key];
 37     if(node){ // node exists
 38       detach(node);
 39       node->data = data;
 40       attach(node);
 41     }
 42     else
 43     {
 44       if(free_entries_.empty())
 45       {// 可用结点为空，即cache已满
 46         node = tail_->prev;
 47         detach(node);
 48         hashmap_.erase(node->key);
 49       }
 50       else{
 51         node = free_entries_.back();
 52         free_entries_.pop_back();
 53       }
 54       node->key = key;
 55       node->data = data;
 56       hashmap_[key] = node;
 57       attach(node);
 58     }
 59   }
 60   
 61   T Get(K key){
 62     Node<K,T> *node = hashmap_[key];
 63     if(node){
 64       detach(node);
 65       attach(node);
 66       return node->data;
 67     }
 68     else{// 如果cache中没有，返回T的默认值。与hashmap行为一致
 69       return T();
 70     }
 71   }
 72  private:
 73   // 分离结点
 74   void detach(Node<K,T>* node){
 75     node->prev->next = node->next;
 76     node->next->prev = node->prev;
 77   }
 78   // 将结点插入头部
 79   void attach(Node<K,T>* node){
 80     node->prev = head_;
 81     node->next = head_->next;
 82     head_->next = node;
 83     node->next->prev = node;
 84   }
 85  private:
 86   hash_map<K, Node<K,T>* > hashmap_;
 87   vector<Node<K,T>* > free_entries_; // 存储可用结点的地址
 88   Node<K,T> *head_, *tail_;
 89   Node<K,T> *entries_; // 双向链表中的结点
 90 };
 91 
 92 int main(){
 93   hash_map<int, int> map;
 94   map[9]= 999;
 95   cout<<map[9]<<endl;
 96   cout<<map[10]<<endl;
 97   LRUCache<int, string> lru_cache(100);
 98   lru_cache.Put(1, "one");
 99   cout<<lru_cache.Get(1)<<endl;
100   if(lru_cache.Get(2) == "")
101     lru_cache.Put(2, "two");
102   cout<<lru_cache.Get(2);
103   return 0;
104 }

b. 采用stl::list类型实现方式

  1 #include <iostream>
  2 #include <list>
  3 #include <ext/hash_map>
  4 #include <stdint.h>
  5 #include <string>
  6 
  7 template<class T1, class T2>
  8 class Lru
  9 {
 10  public:
 11   typedef std::list<std::pair<T1, T2> > List;
 12   typedef typename List::iterator iterator;
 13   typedef __gnu_cxx::hash_map<T1, iterator> Map;
 14 
 15   Lru()
 16   {
 17     size_ = 1000;
 18     resize(size_);
 19   }
 20 
 21   ~Lru()
 22   {
 23     clear();
 24   }
 25 
 26   void resize(int32_t size)
 27   {
 28     if (size > 0)
 29     {
 30       size_ = size;
 31     }
 32   }
 33 
 34   T2* find(const T1& first)
 35   {
 36     typename Map::iterator i = index_.find(first);
 37 
 38     if (i == index_.end())
 39     {
 40       return NULL;
 41     }
 42     else
 43     {
 44       typename List::iterator n = i->second;
 45       list_.splice(list_.begin(), list_, n);
 46       return &(list_.front().second);
 47     }
 48   }
 49 
 50   void remove(const T1& first)
 51   {
 52     typename Map::iterator i = index_.find(first);
 53     if (i != index_.end())
 54     {
 55       typename List::iterator n = i->second;
 56       list_.erase(n);
 57       index_.erase(i);
 58     }
 59   }
 60 
 61   void insert(const T1& first, const T2& second)
 62   {
 63     typename Map::iterator i = index_.find(first);
 64     if (i != index_.end())
 65     { // found
 66       typename List::iterator n = i->second;
 67       list_.splice(list_.begin(), list_, n);
 68       index_.erase(n->first);
 69       n->first = first;
 70       n->second = second;
 71       index_[first] = n;
 72     }
 73     else if (size() >= size_ )
 74     { // erase the last element
 75       typename List::iterator n = list_.end();
 76       --n; // the last element
 77       list_.splice(list_.begin(), list_, n);
 78       index_.erase(n->first);
 79       n->first = first;
 80       n->second = second;
 81       index_[first] = n;
 82     }
 83     else
 84     {
 85       list_.push_front(std::make_pair(first, second));
 86       typename List::iterator n = list_.begin();
 87       index_[first] = n;
 88     }
 89   }
 90 
 91   /// Random access to items
 92   iterator begin()
 93   {
 94     return list_.begin();
 95   }
 96 
 97   iterator end()
 98   {
 99     return list_.end();
100   }
101 
102   int size()
103   {
104     return index_.size();
105   }
106 
107   // Clear cache
108   void clear()
109   {
110     index_.clear();
111     list_.clear();
112   }
113 
114  private:
115   int32_t size_;
116   List list_;
117   Map index_;
118 };
119 
120 
121 
122 int main(void)
123 {
124   std::cout << "hello world " << std::endl;
125   tfs::Lru<uint32_t, std::string> name_cache;
126   name_cache.insert(1, "one");
127   name_cache.insert(2, "two");
128 
129   std::string* value = name_cache.find(1);
130   const char* v = value->c_str();
131   std::cout << "result of the key 1 is: " << *name_cache.find(1) << std::endl;
132       
133   return 0;
134 }

2. LRFU缓存算法

2.1 前言
缓存算法有许多种，各种缓存算法的核心区别在于它们的淘汰机制。通常的淘汰机制：所有元素按某种量化的重要程度进行排序，分值最低者则会被优先淘汰。而重要程度的量化指标又通常可以参考了两个维度：最后被访问的时间和最近被访问的频率次数。依次例如如下：

1. LRU(Least Recently Used ),总是淘汰最后访问时间最久的元素。

这种算法存在着问题：可能由于一次冷数据的批量查询而误淘汰大量热点的数据。

2. LFU(Least Frequently Used ),总是淘汰最近访问频率最小的元素。

这种算法也存在明显的问题: a. 如果频率时间度量是1小时，则平均一天每个小时内的访问频率1000的热点数据可能会被2个小时的一段时间内的访问频率是1001的数据剔除掉；b. 最近新加入的数据总会易于被剔除掉，由于其起始的频率值低。本质上其“重要性”指标访问频率是指在多长的时间维度进行衡量？其难以标定，所以在业界很少单一直接使用。也由于两种算法的各自特点及缺点，所以通常在生产线上会根据业务场景联合LRU与LFU一起使用，称之为LRFU。

2.2 实现机制
正是由于LRU与LFU算法各具特点，所以在行业生产线上通常会联合两者使用。我们可以在LRU的实现基础上稍作衍生，能实现LRFU缓存机制，即可以采用队列分级的思想，例如oracle利用两个队列维护访问的数据元素，按被访问的频率的维度把元素分别搁在热端与冷端队列；而在同一个队列内，最后访问时间越久的元素会越被排在队列尾。

参考：
1.   https://en.wikipedia.org/wiki/Least_frequently_used
2.   http://code.taobao.org/p/tfs/src/
3.   http://www.hawstein.com/posts/lru-cache-impl.html