STL源码笔记1 —— allocators

简述

allocators是STL中很重要的一个幕后英雄的角色，STL中的容器在使用过程中需要不断的放元素进去和取元素出来，而在此过程中，如何更高效的申请和释放内存是十分影响STL容器的性能的。

operator new() 和 malloc()

首先内存的分配动作，一层层调用下去，最终到了CRT的层面上都是调用malloc()来分配，而malloc再根据所在的操作系统，调用不同的操作系统api才能真正的拿到内存。下面是vs2015和gcc 2.95的源码，里面的 operator new 最终调用 malloc 进行内存分配：

//VS2015中，Microsoft Visual Studio 14.0\VC\crt\src\vcruntime\new_debug.cpp中的源码
//debug模式的_malloc_dbg
// Debug overload of operator new: passes size, block_use, file_name and
// line_number straight through to _malloc_dbg and returns the first non-null
// block. The loop only ends by returning a block or by one of the throw helpers.
void* __CRTDECL operator new(
    size_t const size,
    int const    block_use,
    char const*  file_name,
    int const    line_number
    )
{
    for (;;)
    {
        // Success path: hand the non-null block back to the caller.
        if (void* const block = _malloc_dbg(size, block_use, file_name, line_number))
        {
            return block;
        }

        // Allocation failed. A zero result from _callnewh ends the retrying
        // (presumably: no new-handler could free memory -- confirm against the
        // CRT documentation); any other result goes around the loop again.
        if (_callnewh(size) == 0)
        {
            // SIZE_MAX is reported through the bad_array_new_length helper,
            // every other size through the bad_alloc helper.
            if (size == SIZE_MAX)
            {
                __scrt_throw_std_bad_array_new_length();
            }
            else
            {
                __scrt_throw_std_bad_alloc();
            }
        }
    }
}

//VS2015中，Microsoft Visual Studio 14.0\VC\crt\src\linkopts\nothrownew.cpp中的源码
//不抛异常的new 直接调用malloc
// operator new variant that signals failure by returning nullptr instead of
// throwing: calls malloc directly and keeps retrying for as long as
// _callnewh returns a non-zero value.
void* operator new(size_t size)
{
    for (;;)
    {
        // Success path: return the block malloc produced.
        if (void* const block = malloc(size))
            return block;

        // A zero result from _callnewh stops the retrying and reports failure.
        if (_callnewh(size) == 0)
            return nullptr;

        // The new handler was successful; try to allocate again...
    }
}


//在gcc 2.95里面 gcc-2.95.1\gcc\cp\new1.cc
// Non-throwing operator new: returns 0 on failure instead of raising bad_alloc.
void * operator new (size_t sz, const std::nothrow_t&) throw()
{
  void *p;

  /* malloc (0) is unpredictable; avoid it.  */
  if (sz == 0)
    sz = 1;
  p = (void *) malloc (sz);
  // Retry loop: while malloc keeps failing, run the current handler and try again.
  while (p == 0)
    {
      new_handler handler = __new_handler;
      // Null handler: nothing left to try, report failure to the caller.
      if (! handler)
	return 0;
      // A bad_alloc escaping the handler is converted into a null result.
      try
	{
	  handler ();
	}
      catch (bad_alloc &)
	{
	  return 0;
	}

      p = (void *) malloc (sz);
    }

  return p;
}

// Throwing operator new: allocates with malloc and throws bad_alloc when the
// allocation fails and __new_handler is null.
void * operator new (size_t sz) throw (std::bad_alloc)
{
  void *p;

  /* malloc (0) is unpredictable; avoid it.  */
  if (sz == 0)
    sz = 1;
  p = (void *) malloc (sz);
  // Retry loop: while malloc keeps failing, run the current handler and try again.
  while (p == 0)
    {
      new_handler handler = __new_handler;
      // Null handler: give up by throwing.
      if (! handler)
	throw bad_alloc ();
      // Unlike the nothrow overload there is no try/catch here, so anything
      // the handler throws propagates to the caller.
      handler ();
      p = (void *) malloc (sz);
    }

  return p;
}

然而malloc分配的内存如果在debug模式下，会有许多额外的信息（包括大小、前后块指针、使用情况等信息），而即使是在release模式下，也至少会有标识大小的字节被占用。那么每次申请的内存就会有额外的开销，如果申请的空间很小，额外的开销占比就会很大。因此，产生了一种使用内存管理，减少这种开销的想法，这也是STL的allocators分配器最核心的功能。

VS2015 中的allocator

在VS中，几个容器的设计是这样的：

//VS中的vector
// Excerpt of the vector declaration (body elided): when the caller does not
// name an allocator, _Alloc defaults to allocator<_Ty>.
template<class _Ty,
	class _Alloc = allocator<_Ty> >
	class vector
		: public _Vector_alloc<_Vec_base_types<_Ty, _Alloc> >
	{	// varying size array of values
        ...
    };

//VS中的list
// Excerpt of the list declaration (body elided): when the caller does not
// name an allocator, _Alloc defaults to allocator<_Ty>.
template<class _Ty,
	class _Alloc = allocator<_Ty> >
	class list
		: public _List_buy<_Ty, _Alloc>
	{	// bidirectional linked list
        ...
    };

可以看到在VS的容器里，默认使用的是allocator这个class，那么再去 Microsoft Visual Studio 14.0\VC\include\xmemory0 观察allocator的实现：

// Excerpt of allocator<_Ty> (other members elided). Both allocate overloads
// end up in _Allocate(_Count, sizeof (_Ty)); nothing is cached in this class.
template<class _Ty>
	class allocator
	{	// generic allocator for objects of class _Ty
public:
    ...
    
	// Element-count overload: converts the void* from _Allocate to pointer.
	_DECLSPEC_ALLOCATOR pointer allocate(size_type _Count)
		{	// allocate array of _Count elements
		return (static_cast<pointer>(_Allocate(_Count, sizeof (_Ty))));
		}

	// Hint overload: the second argument is unnamed and unused.
	_DECLSPEC_ALLOCATOR pointer allocate(size_type _Count, const void *)
		{	// allocate array of _Count elements, ignore hint
		return (allocate(_Count));
		}
    };

allocate调用了_Allocate，再去看_Allocate的实现：

	// Obtains raw storage for _Count objects of _Sz bytes each from
	// ::operator new. Returns a null pointer when _Count is 0 and calls
	// _Xbad_alloc() when the total byte count does not fit in size_t.
	// _Try_aligned_allocation enables the large-block path on x86/x64.
	_DECLSPEC_ALLOCATOR void *_Allocate(size_t _Count, size_t _Sz,
		bool _Try_aligned_allocation = true)
	{	// allocate storage for _Count elements of size _Sz
	void *_Ptr = 0;

	if (_Count == 0)
		return (_Ptr);

	// check overflow of multiply
	if ((size_t)(-1) / _Sz < _Count)
		_Xbad_alloc();	// report no memory
	const size_t _User_size = _Count * _Sz;

 #if defined(_M_IX86) || defined(_M_X64)
	// Large-block path: taken for requests of at least
	// _BIG_ALLOCATION_THRESHOLD bytes when the caller allows it.
	if (_Try_aligned_allocation
		&& _BIG_ALLOCATION_THRESHOLD <= _User_size)
		{	// allocate large block
		static_assert(sizeof (void *) < _BIG_ALLOCATION_ALIGNMENT,
			"Big allocations should at least match vector register size");
		// Over-allocate by _NON_USER_SIZE; a sum that is not larger than
		// _User_size means the addition wrapped around.
		const size_t _Block_size = _NON_USER_SIZE + _User_size;
		if (_Block_size <= _User_size)
			_Xbad_alloc();	// report no memory
		const uintptr_t _Ptr_container =
			reinterpret_cast<uintptr_t>(::operator new(_Block_size));
		_SCL_SECURE_ALWAYS_VALIDATE(_Ptr_container != 0);
		// Advance past the reserved bytes, then clear the low bits so the
		// result is a multiple of _BIG_ALLOCATION_ALIGNMENT (assumes the
		// alignment is a power of two -- TODO confirm).
		_Ptr = reinterpret_cast<void *>((_Ptr_container + _NON_USER_SIZE)
			& ~(_BIG_ALLOCATION_ALIGNMENT - 1));
		// Store the address returned by operator new in the slot just before
		// the user pointer (presumably read back on deallocation).
		static_cast<uintptr_t *>(_Ptr)[-1] = _Ptr_container;

 #ifdef _DEBUG
		// Debug builds also write a sentinel value one slot further back.
		static_cast<uintptr_t *>(_Ptr)[-2] = _BIG_ALLOCATION_SENTINEL;
 #endif /* _DEBUG */
		}
	else
 #endif /* defined(_M_IX86) || defined(_M_X64) */

		// Normal path: a plain ::operator new of exactly _User_size bytes.
		{	// allocate normal block
		_Ptr = ::operator new(_User_size);
		_SCL_SECURE_ALWAYS_VALIDATE(_Ptr != 0);
		}
	return (_Ptr);
	}

里面除了在大小计算溢出时调用 _Xbad_alloc() 报告内存不足之外，就是直接调用 ::operator new 来实现内存的分配（大块内存只是额外做了对齐处理），所以实际上VS并没有对STL的allocator做特别的优化。
同样的，通过查看 deallocate() 对应的源代码，也看出来VS在释放的时候也只是一个对 operator delete 的封装而已。因此，可以认为VS在这一方面并没有做特殊设计。

GCC2.9 中的allocator

同样，我们也去看GCC中容器的实现：

//gcc-2.95.1\libstdc++\stl\stl_vector.h
// Excerpt of the vector declaration (body elided): the default allocator is
// whatever the __STL_DEFAULT_ALLOCATOR(_Tp) macro expands to.
template <class _Tp, class _Alloc = __STL_DEFAULT_ALLOCATOR(_Tp) >
class vector : protected _Vector_base<_Tp, _Alloc> 
{
    ···
};

//gcc-2.95.1\libstdc++\stl\stl_list.h
// Excerpt of the list declaration (body elided): the default allocator is
// whatever the __STL_DEFAULT_ALLOCATOR(_Tp) macro expands to.
template <class _Tp, class _Alloc = __STL_DEFAULT_ALLOCATOR(_Tp) >
class list : protected _List_base<_Tp, _Alloc> {
    ...
};

//在stl_config.h中
// Chooses the containers' default allocator unless the macro is already
// defined: allocator<T> when __STL_USE_STD_ALLOCATORS is defined, otherwise
// the non-template type alloc (the argument T is then ignored).
# ifndef __STL_DEFAULT_ALLOCATOR
#   ifdef __STL_USE_STD_ALLOCATORS
#     define __STL_DEFAULT_ALLOCATOR(T) allocator<T>
#   else
#     define __STL_DEFAULT_ALLOCATOR(T) alloc
#   endif
# endif

可以看到，宏 __STL_DEFAULT_ALLOCATOR，如果没有特别说明的情况下，是使用alloc的，那么我们来看看alloc类的实现：

//默认情况下 __NODE_ALLOCATOR_THREADS 为false，所以是单线程的
// With _NOTHREADS defined, the lock/unlock macros and __VOLATILE expand to
// nothing and the thread flag is false.
# ifdef _NOTHREADS
#   define __NODE_ALLOCATOR_LOCK
#   define __NODE_ALLOCATOR_UNLOCK
#   define __NODE_ALLOCATOR_THREADS false
#   define __VOLATILE
# endif
// alloc is __default_alloc_template instantiated with that thread flag and
// instance number 0.
typedef __default_alloc_template<__NODE_ALLOCATOR_THREADS, 0> alloc;

// Excerpt of __default_alloc_template showing its constants and static state
// (member functions elided).
template <bool threads, int inst>
class __default_alloc_template {
    ...
private:
    // Block sizes step by _ALIGN bytes up to _MAX_BYTES, which gives
    // 128 / 8 = 16 free lists.
    enum {_ALIGN = 8};
    enum {_MAX_BYTES = 128};
    enum {_NFREELISTS = _MAX_BYTES/_ALIGN};

    // One block, viewed either as a link to another _Obj or as the bytes
    // handed to the client; the two views share the same storage.
    union _Obj {
        union _Obj* _M_free_list_link;
        char _M_client_data[1];    /* The client sees this.        */
    };

    // One list head per block size.
    static _Obj* __VOLATILE _S_free_list[_NFREELISTS];
    // Chunk allocation state.
    static char* _S_start_free;
    static char* _S_end_free;
    static size_t _S_heap_size;

    ...
};

借用侯捷老师在课程中用的图，其实在gcc的alloc中，设计了一个16条链表的数组 _S_free_list ，每一条链表负责某个特定大小区块的分配，分别负责从8个字节到128个字节（以8字节为间隔）的区块。当容器需要分配不超过128字节的内存时，都是从这个分配器中去申请：先把大小向上调整到8的倍数（例如121字节会调整到128字节），再到对应的链表中查找是否有空闲的区块（例如128字节需要到 _S_free_list[15] 里面查找）；如果该链表中没有挂着空闲区块，才会从内存池中切割，内存池也不够时再通过malloc向系统申请一大块内存；超过128字节的请求则直接交给 malloc_alloc 处理。

avatar

同样，再往下看，看看该类中的 allocate 和 deallocate的实现：

// Excerpt of __default_alloc_template showing the allocate/deallocate pair
// (other members elided).
template <bool threads, int inst>
class __default_alloc_template {
    ...

      // Maps a byte count to its free-list slot: round up to a multiple of
      // _ALIGN, divide by _ALIGN, subtract one for the zero-based index.
      static  size_t _S_freelist_index(size_t __bytes) {
        return (((__bytes) + _ALIGN-1)/_ALIGN - 1);
      }

    public:
      // Returns a block of at least __n bytes: from malloc_alloc when __n is
      // above _MAX_BYTES, otherwise from the matching free list (refilled
      // through _S_refill when that list is empty).
      /* __n must be > 0      */
      static void* allocate(size_t __n)
      {
        _Obj* __VOLATILE* __my_free_list;
        _Obj* __RESTRICT __result;

        // Requests above _MAX_BYTES bypass the free lists entirely.
        if (__n > (size_t) _MAX_BYTES) {
            return(malloc_alloc::allocate(__n));
        }
        __my_free_list = _S_free_list + _S_freelist_index(__n);

        __result = *__my_free_list;
        // Empty list: _S_refill is asked for the size adjusted by _S_round_up
        // and its return value is handed to the caller.
        if (__result == 0) {
            void* __r = _S_refill(_S_round_up(__n));
            return __r;
        }
        // Pop the head block and make its successor the new head.
        *__my_free_list = __result -> _M_free_list_link;
        return (__result);
      };

      // Gives back a block of __n bytes: to malloc_alloc when __n is above
      // _MAX_BYTES, otherwise by pushing it onto the head of its free list.
      /* __p may not be 0 */
      static void deallocate(void* __p, size_t __n)
      {
        _Obj* __q = (_Obj*)__p;
        _Obj* __VOLATILE* __my_free_list;

        if (__n > (size_t) _MAX_BYTES) {
            malloc_alloc::deallocate(__p, __n);
            return;
        }
        __my_free_list = _S_free_list + _S_freelist_index(__n);

        // Push at the head: the block is linked into the list, no free() call
        // is made on this path.
        __q -> _M_free_list_link = *__my_free_list;
        *__my_free_list = __q;
        // lock is released here
      }

  ...
};

_S_freelist_index(__n) 这个函数可以计算出需要的空间会落在 _S_free_list 这个数组的哪个index上，再从中取出一块空余内存分配。而如果没有空余内存，则会调用 _S_refill(_S_round_up(__n)) 该函数重新分配一段内存，源代码如下：

// Refills the free list for objects of __n bytes and returns one object for
// the caller. Asks _S_chunk_alloc for 20 objects; __nobjs is re-read after
// the call, so fewer may have been delivered.
template <bool __threads, int __inst>
void*
__default_alloc_template<__threads, __inst>::_S_refill(size_t __n)
{
    int __nobjs = 20;
    char* __chunk = _S_chunk_alloc(__n, __nobjs);
    _Obj* __VOLATILE* __my_free_list;
    _Obj* __result;
    _Obj* __current_obj;
    _Obj* __next_obj;
    int __i;

    // Exactly one object: return it, the free list is left untouched.
    if (1 == __nobjs) return(__chunk);
    __my_free_list = _S_free_list + _S_freelist_index(__n);

    /* Build free list in chunk */
      // The first object is the return value; the list head becomes the
      // second object, located __n bytes into the chunk.
      __result = (_Obj*)__chunk;
      *__my_free_list = __next_obj = (_Obj*)(__chunk + __n);
      // Link objects 1 .. __nobjs-1 together in address order; the last one
      // gets a null link and ends the loop.
      for (__i = 1; ; __i++) {
        __current_obj = __next_obj;
        __next_obj = (_Obj*)((char*)__next_obj + __n);
        if (__nobjs - 1 == __i) {
            __current_obj -> _M_free_list_link = 0;
            break;
        } else {
            __current_obj -> _M_free_list_link = __next_obj;
        }
      }
    return(__result);
}

其中，真正申请空间的函数就是 _S_chunk_alloc(__n, __nobjs) ，其中 __nobjs = 20。阅读下面的代码可以看到，如果内存池不够用而需要向系统重新申请内存，会一次申请 2 × 20 = 40 个区块大小再多一点（_S_heap_size 的 1/16 向上取整到8的倍数）的内存：

// Bytes needed for the full batch of __nobjs objects.
size_t __total_bytes = __size * __nobjs;
// Size requested for a fresh chunk: twice the batch, plus _S_heap_size / 16
// (the >> 4) passed through _S_round_up.
size_t __bytes_to_get = 2 * __total_bytes + _S_round_up(_S_heap_size >> 4);

然后再从该空间中分出一块作为当前分配，具体源码如下：

// Carves storage for __nobjs objects of __size bytes out of the region
// [_S_start_free, _S_end_free). __nobjs is passed by reference and is lowered
// when only part of the batch fits. When not even one object fits, a new
// region is obtained and the function calls itself again.
template <bool __threads, int __inst>
char*
__default_alloc_template<__threads, __inst>::_S_chunk_alloc(size_t __size,
                                                            int& __nobjs)
{
    char* __result;
    size_t __total_bytes = __size * __nobjs;
    size_t __bytes_left = _S_end_free - _S_start_free;

    if (__bytes_left >= __total_bytes) {
        // Case 1: the region holds the whole batch.
        __result = _S_start_free;
        _S_start_free += __total_bytes;
        return(__result);
    } else if (__bytes_left >= __size) {
        // Case 2: the region holds at least one object; shrink __nobjs to
        // the number that fits and hand those out.
        __nobjs = (int)(__bytes_left/__size);
        __total_bytes = __size * __nobjs;
        __result = _S_start_free;
        _S_start_free += __total_bytes;
        return(__result);
    } else {
        // Case 3: not even one object fits; a new region is needed.
        // Request twice the batch plus a term derived from _S_heap_size / 16.
        size_t __bytes_to_get =
	  2 * __total_bytes + _S_round_up(_S_heap_size >> 4);
        // Try to make use of the left-over piece.
        // The remainder is pushed onto the free list that
        // _S_freelist_index(__bytes_left) selects, so it is not discarded.
        if (__bytes_left > 0) {
            _Obj* __VOLATILE* __my_free_list =
                        _S_free_list + _S_freelist_index(__bytes_left);

            ((_Obj*)_S_start_free) -> _M_free_list_link = *__my_free_list;
            *__my_free_list = (_Obj*)_S_start_free;
        }
        _S_start_free = (char*)malloc(__bytes_to_get);
        if (0 == _S_start_free) {
            size_t __i;
            _Obj* __VOLATILE* __my_free_list;
	    _Obj* __p;
            // Try to make do with what we have.  That can't
            // hurt.  We do not try smaller requests, since that tends
            // to result in disaster on multi-process machines.
            // Scan the free lists for sizes __size .. _MAX_BYTES in _ALIGN
            // steps; the first non-empty list gives up its head block, which
            // becomes the new region before retrying.
            for (__i = __size; __i <= _MAX_BYTES; __i += _ALIGN) {
                __my_free_list = _S_free_list + _S_freelist_index(__i);
                __p = *__my_free_list;
                if (0 != __p) {
                    *__my_free_list = __p -> _M_free_list_link;
                    _S_start_free = (char*)__p;
                    _S_end_free = _S_start_free + __i;
                    return(_S_chunk_alloc(__size, __nobjs));
                    // Any leftover piece will eventually make it to the
                    // right free list.
                }
            }
            // Last resort: every scanned list was empty, so fall back to
            // malloc_alloc::allocate for the same byte count.
	    _S_end_free = 0;	// In case of exception.
            _S_start_free = (char*)malloc_alloc::allocate(__bytes_to_get);
            // This should either throw an
            // exception or remedy the situation.  Thus we assume it
            // succeeded.
        }
        // Record the new region and retry; the region now spans
        // __bytes_to_get bytes, which is at least twice the batch.
        _S_heap_size += __bytes_to_get;
        _S_end_free = _S_start_free + __bytes_to_get;
        return(_S_chunk_alloc(__size, __nobjs));
    }
}