Studying the memcached Source: An Analysis of the Project Files

memcached source version: memcached-1.4.24

1. Hash-related files

  • hash.h / hash.c

These contain just one function, hash_init(enum hashfunc_type type), which main() calls to select the hash function to use.

hash.h

typedef uint32_t (*hash_func)(const void *key, size_t length);
hash_func hash;

enum hashfunc_type
{
    JENKINS_HASH=0,
    MURMUR3_HASH
};

int hash_init(enum hashfunc_type type);

 hash.c

#include "memcached.h"
#include "jenkins_hash.h"
#include "murmur3_hash.h"

int hash_init(enum hashfunc_type type)
{
    switch(type)
    {
        case JENKINS_HASH:
            hash = jenkins_hash;
            settings.hash_algorithm = "jenkins";
            break;
        case MURMUR3_HASH:
            hash = MurmurHash3_x86_32;
            settings.hash_algorithm = "murmur3";
            break;
        default:
            return -1;
    }
    return 0;
}
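
After hash_init() runs, the rest of the code only ever calls the global hash function pointer and never needs to know which algorithm was picked. The standalone sketch below mimics that dispatch pattern with two toy stand-in hash functions (the real jenkins_hash and MurmurHash3_x86_32 are listed further down); the power-of-two bucket mask at the end is an assumption meant to mirror how assoc.c turns the 32-bit hash value into a bucket index, not a copy of memcached's code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint32_t (*hash_func)(const void *key, size_t length);
static hash_func hash;                        /* same shape as the global in hash.h */

/* toy stand-ins for jenkins_hash / MurmurHash3_x86_32 (not the real algorithms) */
static uint32_t fake_jenkins(const void *key, size_t length) {
    const uint8_t *p = key; uint32_t h = 0xdeadbeef;
    while (length--) h = h * 33 + *p++;
    return h;
}
static uint32_t fake_murmur3(const void *key, size_t length) {
    const uint8_t *p = key; uint32_t h = 0;
    while (length--) { h ^= *p++; h *= 0x5bd1e995u; }
    return h;
}

enum hashfunc_type { JENKINS_HASH = 0, MURMUR3_HASH };

static int hash_init(enum hashfunc_type type) {
    switch (type) {
        case JENKINS_HASH: hash = fake_jenkins; break;
        case MURMUR3_HASH: hash = fake_murmur3; break;
        default: return -1;
    }
    return 0;
}

int main(void) {
    const char *key = "user:42";
    hash_init(JENKINS_HASH);                     /* main() picks the algorithm once */
    uint32_t hv = hash(key, strlen(key));        /* every caller just uses the pointer */
    unsigned int bucket = hv & ((1u << 16) - 1); /* e.g. 2^16 buckets: mask, not modulo */
    printf("hv=%u bucket=%u\n", (unsigned)hv, bucket);
    return 0;
}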

memcached provides two hash functions, each defined in one of the two source-file pairs below.

  • jenkins_hash.h / jenkins_hash.c

jenkins_hash.h

uint32_t jenkins_hash(const void *key, size_t length);

 jenkins_hash.c 

/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/*
 * Hash table
 *
 * The hash function used here is by Bob Jenkins, 1996:
 *    <http://burtleburtle.net/bob/hash/doobs.html>
 *       "By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.
 *       You may use this code any way you wish, private, educational,
 *       or commercial.  It's free."
 *
 */
#include "memcached.h"
#include "jenkins_hash.h"

/*
 * Since the hash function does bit manipulation, it needs to know
 * whether it's big or little-endian. ENDIAN_LITTLE and ENDIAN_BIG
 * are set in the configure script.
 */
#if ENDIAN_BIG == 1
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 1
#else
# if ENDIAN_LITTLE == 1
#  define HASH_LITTLE_ENDIAN 1
#  define HASH_BIG_ENDIAN 0
# else
#  define HASH_LITTLE_ENDIAN 0
#  define HASH_BIG_ENDIAN 0
# endif
#endif

#define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k))))

/*
-------------------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.

This is reversible, so any information in (a,b,c) before mix() is
still in (a,b,c) after mix().

If four pairs of (a,b,c) inputs are run through mix(), or through
mix() in reverse, there are at least 32 bits of the output that
are sometimes the same for one pair and different for another pair.
This was tested for:
* pairs that differed by one bit, by two bits, in any combination
  of top bits of (a,b,c), or in any combination of bottom bits of
  (a,b,c).
* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
  is commonly produced by subtraction) look like a single 1-bit
  difference.
* the base values were pseudorandom, all zero but one bit set, or
  all zero plus a counter that starts at zero.

Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
satisfy this are
    4  6  8 16 19  4
    9 15  3 18 27 15
   14  9  3  7 17  3
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
for "differ" defined as + with a one-bit base and a two-bit delta.  I
used http://burtleburtle.net/bob/hash/avalanche.html to choose
the operations, constants, and arrangements of the variables.

This does not achieve avalanche.  There are input bits of (a,b,c)
that fail to affect some output bits of (a,b,c), especially of a.  The
most thoroughly mixed value is c, but it doesn't really even achieve
avalanche in c.

This allows some parallelism.  Read-after-writes are good at doubling
the number of bits affected, so the goal of mixing pulls in the opposite
direction as the goal of parallelism.  I did what I could.  Rotates
seem to cost as much as shifts on every machine I could lay my hands
on, and rotates are much kinder to the top and bottom bits, so I used
rotates.
-------------------------------------------------------------------------------
*/
#define mix(a,b,c) \
{ \
  a -= c;  a ^= rot(c, 4);  c += b; \
  b -= a;  b ^= rot(a, 6);  a += c; \
  c -= b;  c ^= rot(b, 8);  b += a; \
  a -= c;  a ^= rot(c,16);  c += b; \
  b -= a;  b ^= rot(a,19);  a += c; \
  c -= b;  c ^= rot(b, 4);  b += a; \
}

/*
-------------------------------------------------------------------------------
final -- final mixing of 3 32-bit values (a,b,c) into c

Pairs of (a,b,c) values differing in only a few bits will usually
produce values of c that look totally different.  This was tested for
* pairs that differed by one bit, by two bits, in any combination
  of top bits of (a,b,c), or in any combination of bottom bits of
  (a,b,c).
* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
  is commonly produced by subtraction) look like a single 1-bit
  difference.
* the base values were pseudorandom, all zero but one bit set, or
  all zero plus a counter that starts at zero.

These constants passed:
 14 11 25 16 4 14 24
 12 14 25 16 4 14 24
and these came close:
  4  8 15 26 3 22 24
 10  8 15 26 3 22 24
 11  8 15 26 3 22 24
-------------------------------------------------------------------------------
*/
#define final(a,b,c) \
{ \
  c ^= b; c -= rot(b,14); \
  a ^= c; a -= rot(c,11); \
  b ^= a; b -= rot(a,25); \
  c ^= b; c -= rot(b,16); \
  a ^= c; a -= rot(c,4);  \
  b ^= a; b -= rot(a,14); \
  c ^= b; c -= rot(b,24); \
}

#if HASH_LITTLE_ENDIAN == 1
uint32_t jenkins_hash(
  const void *key,       /* the key to hash */
  size_t      length)    /* length of the key */
{
  uint32_t a,b,c;                                          /* internal state */
  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */

  /* Set up the internal state */
  a = b = c = 0xdeadbeef + ((uint32_t)length) + 0;

  u.ptr = key;
  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
    const uint32_t *k = key;                           /* read 32-bit chunks */
#ifdef VALGRIND
    const uint8_t  *k8;
#endif /* ifdef VALGRIND */

    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += k[0];
      b += k[1];
      c += k[2];
      mix(a,b,c);
      length -= 12;
      k += 3;
    }

    /*----------------------------- handle the last (probably partial) block */
    /*
     * "k[2]&0xffffff" actually reads beyond the end of the string, but
     * then masks off the part it's not allowed to read.  Because the
     * string is aligned, the masked-off tail is in the same word as the
     * rest of the string.  Every machine with memory protection I've seen
     * does it on word boundaries, so is OK with this.  But VALGRIND will
     * still catch it and complain.  The masking trick does make the hash
     * noticeably faster for short strings (like English words).
     */
#ifndef VALGRIND

    switch(length)
    {
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
    case 8 : b+=k[1]; a+=k[0]; break;
    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
    case 5 : b+=k[1]&0xff; a+=k[0]; break;
    case 4 : a+=k[0]; break;
    case 3 : a+=k[0]&0xffffff; break;
    case 2 : a+=k[0]&0xffff; break;
    case 1 : a+=k[0]&0xff; break;
    case 0 : return c;  /* zero length strings require no mixing */
    }

#else /* make valgrind happy */

    k8 = (const uint8_t *)k;
    switch(length)
    {
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
    case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
    case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
    case 9 : c+=k8[8];                   /* fall through */
    case 8 : b+=k[1]; a+=k[0]; break;
    case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
    case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
    case 5 : b+=k8[4];                   /* fall through */
    case 4 : a+=k[0]; break;
    case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
    case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
    case 1 : a+=k8[0]; break;
    case 0 : return c;  /* zero length strings require no mixing */
    }

#endif /* !valgrind */

  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
    const uint16_t *k = key;                           /* read 16-bit chunks */
    const uint8_t  *k8;

    /*--------------- all but last block: aligned reads and different mixing */
    while (length > 12)
    {
      a += k[0] + (((uint32_t)k[1])<<16);
      b += k[2] + (((uint32_t)k[3])<<16);
      c += k[4] + (((uint32_t)k[5])<<16);
      mix(a,b,c);
      length -= 12;
      k += 6;
    }

    /*----------------------------- handle the last (probably partial) block */
    k8 = (const uint8_t *)k;
    switch(length)
    {
    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 11: c+=((uint32_t)k8[10])<<16;     /* @fallthrough */
    case 10: c+=k[4];                       /* @fallthrough@ */
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 9 : c+=k8[8];                      /* @fallthrough */
    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 7 : b+=((uint32_t)k8[6])<<16;      /* @fallthrough */
    case 6 : b+=k[2];
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 5 : b+=k8[4];                      /* @fallthrough */
    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 3 : a+=((uint32_t)k8[2])<<16;      /* @fallthrough */
    case 2 : a+=k[0];
             break;
    case 1 : a+=k8[0];
             break;
    case 0 : return c;  /* zero length strings require no mixing */
    }

  } else {                        /* need to read the key one byte at a time */
    const uint8_t *k = key;

    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += k[0];
      a += ((uint32_t)k[1])<<8;
      a += ((uint32_t)k[2])<<16;
      a += ((uint32_t)k[3])<<24;
      b += k[4];
      b += ((uint32_t)k[5])<<8;
      b += ((uint32_t)k[6])<<16;
      b += ((uint32_t)k[7])<<24;
      c += k[8];
      c += ((uint32_t)k[9])<<8;
      c += ((uint32_t)k[10])<<16;
      c += ((uint32_t)k[11])<<24;
      mix(a,b,c);
      length -= 12;
      k += 12;
    }

    /*-------------------------------- last block: affect all 32 bits of (c) */
    switch(length)                   /* all the case statements fall through */
    {
    case 12: c+=((uint32_t)k[11])<<24;
    case 11: c+=((uint32_t)k[10])<<16;
    case 10: c+=((uint32_t)k[9])<<8;
    case 9 : c+=k[8];
    case 8 : b+=((uint32_t)k[7])<<24;
    case 7 : b+=((uint32_t)k[6])<<16;
    case 6 : b+=((uint32_t)k[5])<<8;
    case 5 : b+=k[4];
    case 4 : a+=((uint32_t)k[3])<<24;
    case 3 : a+=((uint32_t)k[2])<<16;
    case 2 : a+=((uint32_t)k[1])<<8;
    case 1 : a+=k[0];
             break;
    case 0 : return c;  /* zero length strings require no mixing */
    }
  }

  final(a,b,c);
  return c;             /* zero length strings require no mixing */
}

#elif HASH_BIG_ENDIAN == 1
/*
 * hashbig():
 * This is the same as hashword() on big-endian machines.  It is different
 * from hashlittle() on all machines.  hashbig() takes advantage of
 * big-endian byte ordering.
 */
uint32_t jenkins_hash( const void *key, size_t length)
{
  uint32_t a,b,c;
  union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */

  /* Set up the internal state */
  a = b = c = 0xdeadbeef + ((uint32_t)length) + 0;

  u.ptr = key;
  if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) {
    const uint32_t *k = key;                           /* read 32-bit chunks */
#ifdef VALGRIND
    const uint8_t  *k8;
#endif /* ifdef VALGRIND */

    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += k[0];
      b += k[1];
      c += k[2];
      mix(a,b,c);
      length -= 12;
      k += 3;
    }

    /*----------------------------- handle the last (probably partial) block */
    /*
     * "k[2]<<8" actually reads beyond the end of the string, but
     * then shifts out the part it's not allowed to read.  Because the
     * string is aligned, the illegal read is in the same word as the
     * rest of the string.  Every machine with memory protection I've seen
     * does it on word boundaries, so is OK with this.  But VALGRIND will
     * still catch it and complain.  The masking trick does make the hash
     * noticeably faster for short strings (like English words).
     */
#ifndef VALGRIND

    switch(length)
    {
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
    case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break;
    case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break;
    case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break;
    case 8 : b+=k[1]; a+=k[0]; break;
    case 7 : b+=k[1]&0xffffff00; a+=k[0]; break;
    case 6 : b+=k[1]&0xffff0000; a+=k[0]; break;
    case 5 : b+=k[1]&0xff000000; a+=k[0]; break;
    case 4 : a+=k[0]; break;
    case 3 : a+=k[0]&0xffffff00; break;
    case 2 : a+=k[0]&0xffff0000; break;
    case 1 : a+=k[0]&0xff000000; break;
    case 0 : return c;              /* zero length strings require no mixing */
    }

#else  /* make valgrind happy */

    k8 = (const uint8_t *)k;
    switch(length)                   /* all the case statements fall through */
    {
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
    case 11: c+=((uint32_t)k8[10])<<8;  /* fall through */
    case 10: c+=((uint32_t)k8[9])<<16;  /* fall through */
    case 9 : c+=((uint32_t)k8[8])<<24;  /* fall through */
    case 8 : b+=k[1]; a+=k[0]; break;
    case 7 : b+=((uint32_t)k8[6])<<8;   /* fall through */
    case 6 : b+=((uint32_t)k8[5])<<16;  /* fall through */
    case 5 : b+=((uint32_t)k8[4])<<24;  /* fall through */
    case 4 : a+=k[0]; break;
    case 3 : a+=((uint32_t)k8[2])<<8;   /* fall through */
    case 2 : a+=((uint32_t)k8[1])<<16;  /* fall through */
    case 1 : a+=((uint32_t)k8[0])<<24; break;
    case 0 : return c;
    }

#endif /* !VALGRIND */

  } else {                        /* need to read the key one byte at a time */
    const uint8_t *k = key;

    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += ((uint32_t)k[0])<<24;
      a += ((uint32_t)k[1])<<16;
      a += ((uint32_t)k[2])<<8;
      a += ((uint32_t)k[3]);
      b += ((uint32_t)k[4])<<24;
      b += ((uint32_t)k[5])<<16;
      b += ((uint32_t)k[6])<<8;
      b += ((uint32_t)k[7]);
      c += ((uint32_t)k[8])<<24;
      c += ((uint32_t)k[9])<<16;
      c += ((uint32_t)k[10])<<8;
      c += ((uint32_t)k[11]);
      mix(a,b,c);
      length -= 12;
      k += 12;
    }

    /*-------------------------------- last block: affect all 32 bits of (c) */
    switch(length)                   /* all the case statements fall through */
    {
    case 12: c+=k[11];
    case 11: c+=((uint32_t)k[10])<<8;
    case 10: c+=((uint32_t)k[9])<<16;
    case 9 : c+=((uint32_t)k[8])<<24;
    case 8 : b+=k[7];
    case 7 : b+=((uint32_t)k[6])<<8;
    case 6 : b+=((uint32_t)k[5])<<16;
    case 5 : b+=((uint32_t)k[4])<<24;
    case 4 : a+=k[3];
    case 3 : a+=((uint32_t)k[2])<<8;
    case 2 : a+=((uint32_t)k[1])<<16;
    case 1 : a+=((uint32_t)k[0])<<24;
             break;
    case 0 : return c;
    }
  }

  final(a,b,c);
  return c;
}
#else /* HASH_XXX_ENDIAN == 1 */
#error Must define HASH_BIG_ENDIAN or HASH_LITTLE_ENDIAN
#endif /* HASH_XXX_ENDIAN == 1 */
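
The first thing jenkins_hash() does is check whether the key pointer is 4-byte aligned ((u.i & 0x3) == 0) so that it can read the key in uint32_t chunks; the union is simply a portable way to inspect the low bits of the pointer value. A tiny standalone demo of that test (illustrative only, not memcached code):

#include <stdint.h>
#include <stdio.h>

static int is_4byte_aligned(const void *ptr) {
    union { const void *ptr; size_t i; } u;   /* same trick as in jenkins_hash() */
    u.ptr = ptr;
    return (u.i & 0x3) == 0;                  /* true when the low two address bits are clear */
}

int main(void) {
    uint32_t buf[2] = {0, 0};                 /* uint32_t storage is 4-byte aligned */
    const char *p = (const char *)buf;
    printf("p+0 aligned: %d\n", is_4byte_aligned(p));      /* 1 */
    printf("p+1 aligned: %d\n", is_4byte_aligned(p + 1));  /* 0 */
    return 0;
}
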
  • murmur3_hash.h / murmur3_hash.c

murmur3_hash.h

// Platform-specific functions and macros
#include <stdint.h>
#include <stddef.h>

uint32_t MurmurHash3_x86_32(const void *key, size_t length);

murmur3_hash.c

//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

#include "murmur3_hash.h"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE    __forceinline

#include <stdlib.h>

#define ROTL32(x,y)    _rotl(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else    // defined(_MSC_VER)

#define    FORCE_INLINE inline __attribute__((always_inline))

static inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

#define    ROTL32(x,y)    rotl32(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

static FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
{
  return p[i];
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

static FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//-----------------------------------------------------------------------------

/* Definition modified slightly from the public domain interface (no seed +
 * return value */
uint32_t MurmurHash3_x86_32 ( const void * key, size_t length)
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = length / 4;

  uint32_t h1 = 0;

  uint32_t c1 = 0xcc9e2d51;
  uint32_t c2 = 0x1b873593;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock32(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(length & 3)
  {
  case 3: k1 ^= tail[2] << 16;
  case 2: k1 ^= tail[1] << 8;
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= length;

  h1 = fmix32(h1);

  //*(uint32_t*)out = h1;
  return h1;
}
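
One idiom in the body loop above is easy to misread: blocks is set to point just past the last full 4-byte block (data + nblocks*4) and the index runs from -nblocks up to 0, so getblock32(blocks, i) still walks the blocks from front to back. The small standalone demo below (not memcached code) shows that negative indexing from an end pointer visits exactly the same elements as a plain forward loop:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t data[4] = {10, 20, 30, 40};
    int nblocks = 4;
    const uint32_t *blocks = data + nblocks;   /* one element past the last block */

    for (int i = -nblocks; i; i++)             /* i = -4, -3, -2, -1 */
        printf("%u ", (unsigned)blocks[i]);    /* prints 10 20 30 40: front to back */
    printf("\n");
    return 0;
}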

When the hash table needs to grow, memcached allocates a new table twice the size of the old one and starts a maintenance thread (start_assoc_maintenance_thread) to migrate the items from the old table into it. If an item is requested while the expansion is still in progress, memcached first works out whether that item's bucket has already been moved to the new table or still lives in the old one, and then searches the corresponding bucket.
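
A simplified sketch of that lookup decision is shown below. It is modelled on assoc_find() in assoc.c; the names hashpower, hashmask, expanding, expand_bucket, primary_hashtable and old_hashtable are the ones that file uses, to the best of my reading, but treat this fragment as an illustration of the idea rather than the verbatim source.

#include <stdint.h>
#include <stdbool.h>

#define hashsize(n) ((uint32_t)1 << (n))
#define hashmask(n) (hashsize(n) - 1)

typedef struct _stritem item;          /* memcached's item type; details omitted here */

/* globals of the same shape as the ones assoc.c keeps (illustration only) */
static item **primary_hashtable;       /* the new, doubled table while expanding */
static item **old_hashtable;           /* the old table the maintenance thread is draining */
static unsigned int hashpower = 16;    /* log2 of the current (new) table size */
static bool expanding = false;         /* set while the maintenance thread is migrating */
static unsigned int expand_bucket = 0; /* old-table buckets below this are already moved */

/* Decide which table's bucket a key with hash value hv currently lives in. */
static item **pick_bucket(uint32_t hv) {
    if (expanding && (hv & hashmask(hashpower - 1)) >= expand_bucket) {
        /* its old-table bucket has not been migrated yet: search the old table */
        return &old_hashtable[hv & hashmask(hashpower - 1)];
    }
    /* not expanding, or the bucket was already moved: search the new table */
    return &primary_hashtable[hv & hashmask(hashpower)];
}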

The expansion maintenance thread, together with the insert/delete operations used during migration, is defined in assoc.h / assoc.c. The memory that the items themselves occupy is handed out by the slab allocator, implemented in the pair of files listed below.

  • slabs.h / slabs.c

slabs.h

 1 /* slabs memory allocation */
 2 #ifndef SLABS_H
 3 #define SLABS_H
 4 
 5 
 6 /** Init the subsystem. 1st argument is the limit on no. of bytes to allocate,
 7     0 if no limit. 2nd argument is the growth factor; each slab will use a chunk
 8     size equal to the previous slab's chunk size times this factor.
 9     3rd argument specifies if the slab allocator should allocate all memory
10     up front (if true), or allocate memory in chunks as it is needed (if false)
11 */
12 void slabs_init(const size_t limit, const double factor, const bool prealloc);
13 
14 
15 /**
16  * Given object size, return id to use when allocating/freeing memory for object
17  * 0 means error: can't store such a large object
18  */
19 unsigned int slabs_clsid(const size_t size);
20 
21 
22 /** Allocate object of given length. 0 on error */ /*@null@*/
23 void *slabs_alloc(const size_t size, unsigned int id, unsigned int *total_chunks);
24 
25 
26 /** Free previously allocated object */
27 void slabs_free(void *ptr, size_t size, unsigned int id);
28 
29 
30 /** Adjust the stats for memory requested */
31 void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal);
32 
33 
34 /** Return a datum for stats in binary protocol */
35 bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c);
36 
37 
38 /** Fill buffer with stats */ /*@null@*/
39 void slabs_stats(ADD_STAT add_stats, void *c);
40 
41 
42 /* Hints as to freespace in slab class */
43 unsigned int slabs_available_chunks(unsigned int id, bool *mem_flag, unsigned int *total_chunks);
44 
45 
46 int start_slab_maintenance_thread(void);
47 void stop_slab_maintenance_thread(void);
48 
49 
50 enum reassign_result_type {
51     REASSIGN_OK=0, REASSIGN_RUNNING, REASSIGN_BADCLASS, REASSIGN_NOSPARE,
52     REASSIGN_SRC_DST_SAME
53 };
54 
55 enum reassign_result_type slabs_reassign(int src, int dst);
56 
57 void slabs_rebalancer_pause(void);
58 void slabs_rebalancer_resume(void);
59 
60 #endif

slabs.c

  1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 /*
  3  * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
  4  * and are divided into chunks. The chunk sizes start off at the size of the
  5  * "item" structure plus space for a small key and value. They increase by
  6  * a multiplier factor from there, up to half the maximum slab size. The last
  7  * slab size is always 1MB, since that's the maximum item size allowed by the
  8  * memcached protocol.
  9  */
 10 #include "memcached.h"
 11 #include <sys/stat.h>
 12 #include <sys/socket.h>
 13 #include <sys/signal.h>
 14 #include <sys/resource.h>
 15 #include <fcntl.h>
 16 #include <netinet/in.h>
 17 #include <errno.h>
 18 #include <stdlib.h>
 19 #include <stdio.h>
 20 #include <string.h>
 21 #include <assert.h>
 22 #include <pthread.h>
 23 
 24 /* powers-of-N allocation structures */
 25 
 26 typedef struct
 27 {
 28     //size of the items this slab class hands out
 29     unsigned int size;      /* sizes of items */
 30     //how many items fit into one slab page of this class
 31     unsigned int perslab;   /* how many items per slab */
 32 
 33     //head of the free-item list
 34     void *slots;           /* list of item ptrs */
 35     //number of free items currently on that list
 36     unsigned int sl_curr;   /* total free items in list */
 37 
 38     //number of slab pages already allocated for this class; list_size is the
 39     //capacity of the slab_list array below
 40     unsigned int slabs;     /* how many slabs were allocated for this class */
 41 
 42     //array of slab pages; every element is one page carved into equal-sized chunks
 43     void **slab_list;       /* array of slab pointers */
 44     //capacity of slab_list, list_size >= slabs
 45     unsigned int list_size; /* size of prev array */
 46 
 47     //used by reassign: index+1 of the slab page being handed to another class
 48     unsigned int killing;  /* index+1 of dying slab, or zero if none */
 49 
 50     //number of bytes this slab class has handed out
 51     size_t requested; /* The number of requested bytes */
 52 } slabclass_t;
 53 
 54 //#define MAX_NUMBER_OF_SLAB_CLASSES (63 + 1)
 55 static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
 56 static size_t mem_limit = 0;
 57 static size_t mem_malloced = 0;
 58 /* If the memory limit has been hit once. Used as a hint to decide when to
 59  * early-wake the LRU maintenance thread */
 60 static bool mem_limit_reached = false;
 61 static int power_largest;
 62 
 63 static void *mem_base = NULL;
 64 static void *mem_current = NULL;
 65 static size_t mem_avail = 0;
 66 
 67 /**
 68  * Access to the slab allocator is protected by this lock
 69  */
 70 static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
 71 static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
 72 
 73 /*
 74  * Forward Declarations
 75  */
 76 static int do_slabs_newslab(const unsigned int id);
 77 static void *memory_allocate(size_t size);
 78 static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
 79 
 80 /* Preallocate as many slab pages as possible (called from slabs_init)
 81    on start-up, so users don't get confused out-of-memory errors when
 82    they do have free (in-slab) space, but no space to make new slabs.
 83    if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
 84    slab types can be made.  if max memory is less than 18 MB, only the
 85    smaller ones will be made.  */
 86 static void slabs_preallocate (const unsigned int maxslabs);
 87 
 88 /*
 89  * Figures out which slab class (chunk size) is required to store an item of
 90  * a given size.
 91  *
 92  * Given object size, return id to use when allocating/freeing memory for object
 93  * 0 means error: can't store such a large object
 94  */
 95 
 96 unsigned int slabs_clsid(const size_t size) {
 97     int res = POWER_SMALLEST;
 98 
 99     if (size == 0)
100         return 0;
101     while (size > slabclass[res].size)
102         if (res++ == power_largest)     /* won't fit in the biggest slab */
103             return 0;
104     return res;
105 }
106 
107 /**
108  * Determines the chunk sizes and initializes the slab class descriptors
109  * accordingly.
110  */
111 void slabs_init(const size_t limit, const double factor, const bool prealloc)
112 {
113     int i = POWER_SMALLEST - 1;
114 
115 
116     //settings.chunk_size defaults to 48 and can be set with the -n option at startup
117     //size has two parts: the item structure itself and the data the item carries
118     //(the data is the value from a set/add command). The loop below grows size
119     //by the factor each iteration, so larger and larger values can be stored
120     unsigned int size = sizeof(item) + settings.chunk_size;
121 
122     mem_limit = limit;//maximum memory, default 64MB (at most 2GB), set with -m
123 
124     //if preallocation was requested, grab one big block now and satisfy later allocations from it
125     if (prealloc)
126     {
127         /* Allocate everything in a big chunk with malloc */
128         mem_base = malloc(mem_limit);
129         if (mem_base != NULL)
130         {
131             mem_current = mem_base;
132             mem_avail = mem_limit;
133         }
134         else
135         {
136             fprintf(stderr, "Warning: Failed to allocate requested memory in"
137                     " one large chunk.\nWill allocate in smaller chunks\n");
138         }
139     }
140 
141     //zero the whole array; this matters: every member of every element starts out as 0
142     memset(slabclass, 0, sizeof(slabclass));
143 
144    //the first element of the slabclass array is not used
145    //settings.item_size_max is the largest item memcached supports, 1MB by default
146    //(this is the often-quoted 1MB limit on a stored value)
147    //core size-class computation:
148    //item_size_max : maximum bytes for a single item, default 1MB
149    //factor : growth factor, default 1.25
150     while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1 && size <= settings.item_size_max / factor)
151     {
152         /* Make sure items are always n-byte aligned */
153         //round size up to a multiple of CHUNK_ALIGN_BYTES
154         if (size % CHUNK_ALIGN_BYTES)
155         {
156             size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
157         }
158 
159         slabclass[i].size = size;
160         slabclass[i].perslab = settings.item_size_max / slabclass[i].size;//record how many items fit in one slab page
161 
162         size *= factor;//grow size by the factor on every iteration
163 
164         if (settings.verbose > 1)
165         {
166             fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n", i, slabclass[i].size, slabclass[i].perslab);
167         }
168     }
169 
170     //finish with one class whose chunk size is the full item_size_max
171     power_largest = i;
172     slabclass[power_largest].size = settings.item_size_max;
173     slabclass[power_largest].perslab = 1;
174     if (settings.verbose > 1) 
175     {
176         fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n", i, slabclass[i].size, slabclass[i].perslab);
177     }
178 
179     /* for the test suite:  faking of how much we've already malloc'd */
180     {
181         char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
182         if (t_initial_malloc)
183         {
184             mem_malloced = (size_t)atol(t_initial_malloc);
185         }
186 
187     }
188 
189     if (prealloc)
190     {
191         slabs_preallocate(power_largest);
192     }
193 }
194 
195 static void slabs_preallocate (const unsigned int maxslabs) {
196     int i;
197     unsigned int prealloc = 0;
198 
199     /* pre-allocate a 1MB slab in every size class so people don't get
200        confused by non-intuitive "SERVER_ERROR out of memory"
201        messages.  this is the most common question on the mailing
202        list.  if you really don't want this, you can rebuild without
203        these three lines.  */
204 
205     for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
206         if (++prealloc > maxslabs)
207             return;
208         if (do_slabs_newslab(i) == 0) {
209             fprintf(stderr, "Error while preallocating slab memory!\n"
210                 "If using -L or other prealloc options, max memory must be "
211                 "at least %d megabytes.\n", power_largest);
212             exit(1);
213         }
214     }
215 
216 }
217 
218 static int grow_slab_list (const unsigned int id) {
219     slabclass_t *p = &slabclass[id];
220     if (p->slabs == p->list_size) {
221         size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
222         void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
223         if (new_list == 0) return 0;
224         p->list_size = new_size;
225         p->slab_list = new_list;
226     }
227     return 1;
228 }
229 
230 static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
231     slabclass_t *p = &slabclass[id];
232     int x;
233     for (x = 0; x < p->perslab; x++) {
234         do_slabs_free(ptr, 0, id);
235         ptr += p->size;
236     }
237 }
238 
239 static int do_slabs_newslab(const unsigned int id) {
240     slabclass_t *p = &slabclass[id];
241     int len = settings.slab_reassign ? settings.item_size_max
242         : p->size * p->perslab;
243     char *ptr;
244 
245     if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0)) {
246         mem_limit_reached = true;
247         MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
248         return 0;
249     }
250 
251     if ((grow_slab_list(id) == 0) ||
252         ((ptr = memory_allocate((size_t)len)) == 0)) {
253 
254         MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
255         return 0;
256     }
257 
258     memset(ptr, 0, (size_t)len);
259     split_slab_page_into_freelist(ptr, id);
260 
261     p->slab_list[p->slabs++] = ptr;
262     mem_malloced += len;
263     MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
264 
265     return 1;
266 }
267 
268 /*@null@*/
269 static void *do_slabs_alloc(const size_t size, unsigned int id, unsigned int *total_chunks) {
270     slabclass_t *p;
271     void *ret = NULL;
272     item *it = NULL;
273 
274     if (id < POWER_SMALLEST || id > power_largest) {
275         MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
276         return NULL;
277     }
278     p = &slabclass[id];
279     assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
280 
281     *total_chunks = p->slabs * p->perslab;
282     /* fail unless we have space at the end of a recently allocated page,
283        we have something on our freelist, or we could allocate a new page */
284     if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
285         /* We don't have more memory available */
286         ret = NULL;
287     } else if (p->sl_curr != 0) {
288         /* return off our freelist */
289         it = (item *)p->slots;
290         p->slots = it->next;
291         if (it->next) it->next->prev = 0;
292         /* Kill flag and initialize refcount here for lock safety in slab
293          * mover's freeness detection. */
294         it->it_flags &= ~ITEM_SLABBED;
295         it->refcount = 1;
296         p->sl_curr--;
297         ret = (void *)it;
298     }
299 
300     if (ret) {
301         p->requested += size;
302         MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
303     } else {
304         MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
305     }
306 
307     return ret;
308 }
309 
310 static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
311     slabclass_t *p;
312     item *it;
313 
314     assert(id >= POWER_SMALLEST && id <= power_largest);
315     if (id < POWER_SMALLEST || id > power_largest)
316         return;
317 
318     MEMCACHED_SLABS_FREE(size, id, ptr);
319     p = &slabclass[id];
320 
321     it = (item *)ptr;
322     it->it_flags |= ITEM_SLABBED;
323     it->slabs_clsid = 0;
324     it->prev = 0;
325     it->next = p->slots;
326     if (it->next) it->next->prev = it;
327     p->slots = it;
328 
329     p->sl_curr++;
330     p->requested -= size;
331     return;
332 }
333 
334 static int nz_strcmp(int nzlength, const char *nz, const char *z) {
335     int zlength=strlen(z);
336     return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
337 }
338 
339 bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
340     bool ret = true;
341 
342     if (add_stats != NULL) {
343         if (!stat_type) {
344             /* prepare general statistics for the engine */
345             STATS_LOCK();
346             APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
347             APPEND_STAT("curr_items", "%u", stats.curr_items);
348             APPEND_STAT("total_items", "%u", stats.total_items);
349             STATS_UNLOCK();
350             item_stats_totals(add_stats, c);
351         } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
352             item_stats(add_stats, c);
353         } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
354             slabs_stats(add_stats, c);
355         } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
356             item_stats_sizes(add_stats, c);
357         } else {
358             ret = false;
359         }
360     } else {
361         ret = false;
362     }
363 
364     return ret;
365 }
366 
367 /*@null@*/
368 static void do_slabs_stats(ADD_STAT add_stats, void *c) {
369     int i, total;
370     /* Get the per-thread stats which contain some interesting aggregates */
371     struct thread_stats thread_stats;
372     threadlocal_stats_aggregate(&thread_stats);
373 
374     total = 0;
375     for(i = POWER_SMALLEST; i <= power_largest; i++) {
376         slabclass_t *p = &slabclass[i];
377         if (p->slabs != 0) {
378             uint32_t perslab, slabs;
379             slabs = p->slabs;
380             perslab = p->perslab;
381 
382             char key_str[STAT_KEY_LEN];
383             char val_str[STAT_VAL_LEN];
384             int klen = 0, vlen = 0;
385 
386             APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
387             APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
388             APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
389             APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
390             APPEND_NUM_STAT(i, "used_chunks", "%u",
391                             slabs*perslab - p->sl_curr);
392             APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
393             /* Stat is dead, but displaying zero instead of removing it. */
394             APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
395             APPEND_NUM_STAT(i, "mem_requested", "%llu",
396                             (unsigned long long)p->requested);
397             APPEND_NUM_STAT(i, "get_hits", "%llu",
398                     (unsigned long long)thread_stats.slab_stats[i].get_hits);
399             APPEND_NUM_STAT(i, "cmd_set", "%llu",
400                     (unsigned long long)thread_stats.slab_stats[i].set_cmds);
401             APPEND_NUM_STAT(i, "delete_hits", "%llu",
402                     (unsigned long long)thread_stats.slab_stats[i].delete_hits);
403             APPEND_NUM_STAT(i, "incr_hits", "%llu",
404                     (unsigned long long)thread_stats.slab_stats[i].incr_hits);
405             APPEND_NUM_STAT(i, "decr_hits", "%llu",
406                     (unsigned long long)thread_stats.slab_stats[i].decr_hits);
407             APPEND_NUM_STAT(i, "cas_hits", "%llu",
408                     (unsigned long long)thread_stats.slab_stats[i].cas_hits);
409             APPEND_NUM_STAT(i, "cas_badval", "%llu",
410                     (unsigned long long)thread_stats.slab_stats[i].cas_badval);
411             APPEND_NUM_STAT(i, "touch_hits", "%llu",
412                     (unsigned long long)thread_stats.slab_stats[i].touch_hits);
413             total++;
414         }
415     }
416 
417     /* add overall slab stats and append terminator */
418 
419     APPEND_STAT("active_slabs", "%d", total);
420     APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
421     add_stats(NULL, 0, NULL, 0, c);
422 }
423 
424 static void *memory_allocate(size_t size) {
425     void *ret;
426 
427     if (mem_base == NULL) {
428         /* We are not using a preallocated large memory chunk */
429         ret = malloc(size);
430     } else {
431         ret = mem_current;
432 
433         if (size > mem_avail) {
434             return NULL;
435         }
436 
437         /* mem_current pointer _must_ be aligned!!! */
438         if (size % CHUNK_ALIGN_BYTES) {
439             size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
440         }
441 
442         mem_current = ((char*)mem_current) + size;
443         if (size < mem_avail) {
444             mem_avail -= size;
445         } else {
446             mem_avail = 0;
447         }
448     }
449 
450     return ret;
451 }
452 
453 void *slabs_alloc(size_t size, unsigned int id, unsigned int *total_chunks) {
454     void *ret;
455 
456     pthread_mutex_lock(&slabs_lock);
457     ret = do_slabs_alloc(size, id, total_chunks);
458     pthread_mutex_unlock(&slabs_lock);
459     return ret;
460 }
461 
462 void slabs_free(void *ptr, size_t size, unsigned int id) {
463     pthread_mutex_lock(&slabs_lock);
464     do_slabs_free(ptr, size, id);
465     pthread_mutex_unlock(&slabs_lock);
466 }
467 
468 void slabs_stats(ADD_STAT add_stats, void *c) {
469     pthread_mutex_lock(&slabs_lock);
470     do_slabs_stats(add_stats, c);
471     pthread_mutex_unlock(&slabs_lock);
472 }
473 
474 void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
475 {
476     pthread_mutex_lock(&slabs_lock);
477     slabclass_t *p;
478     if (id < POWER_SMALLEST || id > power_largest) {
479         fprintf(stderr, "Internal error! Invalid slab class\n");
480         abort();
481     }
482 
483     p = &slabclass[id];
484     p->requested = p->requested - old + ntotal;
485     pthread_mutex_unlock(&slabs_lock);
486 }
487 
488 unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
489         unsigned int *total_chunks) {
490     unsigned int ret;
491     slabclass_t *p;
492 
493     pthread_mutex_lock(&slabs_lock);
494     p = &slabclass[id];
495     ret = p->sl_curr;
496     if (mem_flag != NULL)
497         *mem_flag = mem_limit_reached;
498     if (total_chunks != NULL)
499         *total_chunks = p->slabs * p->perslab;
500     pthread_mutex_unlock(&slabs_lock);
501     return ret;
502 }
503 
504 static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
505 static volatile int do_run_slab_thread = 1;
506 static volatile int do_run_slab_rebalance_thread = 1;
507 
508 #define DEFAULT_SLAB_BULK_CHECK 1
509 int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
510 
511 static int slab_rebalance_start(void) {
512     slabclass_t *s_cls;
513     int no_go = 0;
514 
515     pthread_mutex_lock(&slabs_lock);
516 
517     if (slab_rebal.s_clsid < POWER_SMALLEST ||
518         slab_rebal.s_clsid > power_largest  ||
519         slab_rebal.d_clsid < POWER_SMALLEST ||
520         slab_rebal.d_clsid > power_largest  ||
521         slab_rebal.s_clsid == slab_rebal.d_clsid)
522         no_go = -2;
523 
524     s_cls = &slabclass[slab_rebal.s_clsid];
525 
526     if (!grow_slab_list(slab_rebal.d_clsid)) {
527         no_go = -1;
528     }
529 
530     if (s_cls->slabs < 2)
531         no_go = -3;
532 
533     if (no_go != 0) {
534         pthread_mutex_unlock(&slabs_lock);
535         return no_go; /* Should use a wrapper function... */
536     }
537 
538     s_cls->killing = 1;
539 
540     slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
541     slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
542         (s_cls->size * s_cls->perslab);
543     slab_rebal.slab_pos   = slab_rebal.slab_start;
544     slab_rebal.done       = 0;
545 
546     /* Also tells do_item_get to search for items in this slab */
547     slab_rebalance_signal = 2;
548 
549     if (settings.verbose > 1) {
550         fprintf(stderr, "Started a slab rebalance\n");
551     }
552 
553     pthread_mutex_unlock(&slabs_lock);
554 
555     STATS_LOCK();
556     stats.slab_reassign_running = true;
557     STATS_UNLOCK();
558 
559     return 0;
560 }
561 
562 enum move_status {
563     MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
564 };
565 
566 /* refcount == 0 is safe since nobody can incr while item_lock is held.
567  * refcount != 0 is impossible since flags/etc can be modified in other
568  * threads. instead, note we found a busy one and bail. logic in do_item_get
569  * will prevent busy items from continuing to be busy
570  * NOTE: This is checking it_flags outside of an item lock. I believe this
571  * works since it_flags is 8 bits, and we're only ever comparing a single bit
572  * regardless. ITEM_SLABBED bit will always be correct since we're holding the
573  * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
574  * item having ITEM_SLABBED removed, and the key hasn't been added to the item
575  * yet. The memory barrier from the slabs lock should order the key write and the
576  * flags to the item?
577  * If ITEM_LINKED did exist and was just removed, but we still see it, that's
578  * still safe since it will have a valid key, which we then lock, and then
579  * recheck everything.
580  * This may not be safe on all platforms; If not, slabs_alloc() will need to
581  * seed the item key while holding slabs_lock.
582  */
583 static int slab_rebalance_move(void) {
584     slabclass_t *s_cls;
585     int x;
586     int was_busy = 0;
587     int refcount = 0;
588     uint32_t hv;
589     void *hold_lock;
590     enum move_status status = MOVE_PASS;
591 
592     pthread_mutex_lock(&slabs_lock);
593 
594     s_cls = &slabclass[slab_rebal.s_clsid];
595 
596     for (x = 0; x < slab_bulk_check; x++) {
597         hv = 0;
598         hold_lock = NULL;
599         item *it = slab_rebal.slab_pos;
600         status = MOVE_PASS;
601         if (it->slabs_clsid != 255) {
602             /* ITEM_SLABBED can only be added/removed under the slabs_lock */
603             if (it->it_flags & ITEM_SLABBED) {
604                 /* remove from slab freelist */
605                 if (s_cls->slots == it) {
606                     s_cls->slots = it->next;
607                 }
608                 if (it->next) it->next->prev = it->prev;
609                 if (it->prev) it->prev->next = it->next;
610                 s_cls->sl_curr--;
611                 status = MOVE_FROM_SLAB;
612             } else if ((it->it_flags & ITEM_LINKED) != 0) {
613                 /* If it doesn't have ITEM_SLABBED, the item could be in any
614                  * state on its way to being freed or written to. If no
615                  * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
616                  * and have the key written to it already.
617                  */
618                 hv = hash(ITEM_key(it), it->nkey);
619                 if ((hold_lock = item_trylock(hv)) == NULL) {
620                     status = MOVE_LOCKED;
621                 } else {
622                     refcount = refcount_incr(&it->refcount);
623                     if (refcount == 2) { /* item is linked but not busy */
624                         /* Double check ITEM_LINKED flag here, since we're
625                          * past a memory barrier from the mutex. */
626                         if ((it->it_flags & ITEM_LINKED) != 0) {
627                             status = MOVE_FROM_LRU;
628                         } else {
629                             /* refcount == 1 + !ITEM_LINKED means the item is being
630                              * uploaded to, or was just unlinked but hasn't been freed
631                              * yet. Let it bleed off on its own and try again later */
632                             status = MOVE_BUSY;
633                         }
634                     } else {
635                         if (settings.verbose > 2) {
636                             fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
637                                 it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
638                         }
639                         status = MOVE_BUSY;
640                     }
641                     /* Item lock must be held while modifying refcount */
642                     if (status == MOVE_BUSY) {
643                         refcount_decr(&it->refcount);
644                         item_trylock_unlock(hold_lock);
645                     }
646                 }
647             }
648         }
649 
650         switch (status) {
651             case MOVE_FROM_LRU:
652                 /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
653                  * We only need to hold the slabs_lock while initially looking
654                  * at an item, and at this point we have an exclusive refcount
655                  * (2) + the item is locked. Drop slabs lock, drop item to
656                  * refcount 1 (just our own, then fall through and wipe it
657                  */
658                 pthread_mutex_unlock(&slabs_lock);
659                 do_item_unlink(it, hv);
660                 item_trylock_unlock(hold_lock);
661                 pthread_mutex_lock(&slabs_lock);
662             case MOVE_FROM_SLAB:
663                 it->refcount = 0;
664                 it->it_flags = 0;
665                 it->slabs_clsid = 255;
666                 break;
667             case MOVE_BUSY:
668             case MOVE_LOCKED:
669                 slab_rebal.busy_items++;
670                 was_busy++;
671                 break;
672             case MOVE_PASS:
673                 break;
674         }
675 
676         slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
677         if (slab_rebal.slab_pos >= slab_rebal.slab_end)
678             break;
679     }
680 
681     if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
682         /* Some items were busy, start again from the top */
683         if (slab_rebal.busy_items) {
684             slab_rebal.slab_pos = slab_rebal.slab_start;
685             slab_rebal.busy_items = 0;
686         } else {
687             slab_rebal.done++;
688         }
689     }
690 
691     pthread_mutex_unlock(&slabs_lock);
692 
693     return was_busy;
694 }
695 
696 static void slab_rebalance_finish(void) {
697     slabclass_t *s_cls;
698     slabclass_t *d_cls;
699 
700     pthread_mutex_lock(&slabs_lock);
701 
702     s_cls = &slabclass[slab_rebal.s_clsid];
703     d_cls   = &slabclass[slab_rebal.d_clsid];
704 
705     /* At this point the stolen slab is completely clear */
706     s_cls->slab_list[s_cls->killing - 1] =
707         s_cls->slab_list[s_cls->slabs - 1];
708     s_cls->slabs--;
709     s_cls->killing = 0;
710 
711     memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
712 
713     d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
714     split_slab_page_into_freelist(slab_rebal.slab_start,
715         slab_rebal.d_clsid);
716 
717     slab_rebal.done       = 0;
718     slab_rebal.s_clsid    = 0;
719     slab_rebal.d_clsid    = 0;
720     slab_rebal.slab_start = NULL;
721     slab_rebal.slab_end   = NULL;
722     slab_rebal.slab_pos   = NULL;
723 
724     slab_rebalance_signal = 0;
725 
726     pthread_mutex_unlock(&slabs_lock);
727 
728     STATS_LOCK();
729     stats.slab_reassign_running = false;
730     stats.slabs_moved++;
731     STATS_UNLOCK();
732 
733     if (settings.verbose > 1) {
734         fprintf(stderr, "finished a slab move\n");
735     }
736 }
737 
738 /* Return 1 means a decision was reached.
739  * Move to its own thread (created/destroyed as needed) once automover is more
740  * complex.
741  */
742 static int slab_automove_decision(int *src, int *dst) {
743     static uint64_t evicted_old[MAX_NUMBER_OF_SLAB_CLASSES];
744     static unsigned int slab_zeroes[MAX_NUMBER_OF_SLAB_CLASSES];
745     static unsigned int slab_winner = 0;
746     static unsigned int slab_wins   = 0;
747     uint64_t evicted_new[MAX_NUMBER_OF_SLAB_CLASSES];
748     uint64_t evicted_diff = 0;
749     uint64_t evicted_max  = 0;
750     unsigned int highest_slab = 0;
751     unsigned int total_pages[MAX_NUMBER_OF_SLAB_CLASSES];
752     int i;
753     int source = 0;
754     int dest = 0;
755     static rel_time_t next_run;
756 
757     /* Run less frequently than the slabmove tester. */
758     if (current_time >= next_run) {
759         next_run = current_time + 10;
760     } else {
761         return 0;
762     }
763 
764     item_stats_evictions(evicted_new);
765     pthread_mutex_lock(&slabs_lock);
766     for (i = POWER_SMALLEST; i < power_largest; i++) {
767         total_pages[i] = slabclass[i].slabs;
768     }
769     pthread_mutex_unlock(&slabs_lock);
770 
771     /* Find a candidate source; something with zero evicts 3+ times */
772     for (i = POWER_SMALLEST; i < power_largest; i++) {
773         evicted_diff = evicted_new[i] - evicted_old[i];
774         if (evicted_diff == 0 && total_pages[i] > 2) {
775             slab_zeroes[i]++;
776             if (source == 0 && slab_zeroes[i] >= 3)
777                 source = i;
778         } else {
779             slab_zeroes[i] = 0;
780             if (evicted_diff > evicted_max) {
781                 evicted_max = evicted_diff;
782                 highest_slab = i;
783             }
784         }
785         evicted_old[i] = evicted_new[i];
786     }
787 
788     /* Pick a valid destination */
789     if (slab_winner != 0 && slab_winner == highest_slab) {
790         slab_wins++;
791         if (slab_wins >= 3)
792             dest = slab_winner;
793     } else {
794         slab_wins = 1;
795         slab_winner = highest_slab;
796     }
797 
798     if (source && dest) {
799         *src = source;
800         *dst = dest;
801         return 1;
802     }
803     return 0;
804 }
805 
806 /* Slab rebalancer thread.
807  * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
808  * go to sleep if locks are contended
809  */
810 static void *slab_maintenance_thread(void *arg) {
811     int src, dest;
812 
813     while (do_run_slab_thread) {
814         if (settings.slab_automove == 1) {
815             if (slab_automove_decision(&src, &dest) == 1) {
816                 /* Blind to the return codes. It will retry on its own */
817                 slabs_reassign(src, dest);
818             }
819             sleep(1);
820         } else {
821             /* Don't wake as often if we're not enabled.
822              * This is lazier than setting up a condition right now. */
823             sleep(5);
824         }
825     }
826     return NULL;
827 }
828 
829 /* Slab mover thread.
830  * Sits waiting for a condition to jump off and shovel some memory about
831  */
832 static void *slab_rebalance_thread(void *arg) {
833     int was_busy = 0;
834     /* So we first pass into cond_wait with the mutex held */
835     mutex_lock(&slabs_rebalance_lock);
836 
837     while (do_run_slab_rebalance_thread) {
838         if (slab_rebalance_signal == 1) {
839             if (slab_rebalance_start() < 0) {
840                 /* Handle errors with more specifity as required. */
841                 slab_rebalance_signal = 0;
842             }
843 
844             was_busy = 0;
845         } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
846             was_busy = slab_rebalance_move();
847         }
848 
849         if (slab_rebal.done) {
850             slab_rebalance_finish();
851         } else if (was_busy) {
852             /* Stuck waiting for some items to unlock, so slow down a bit
853              * to give them a chance to free up */
854             usleep(50);
855         }
856 
857         if (slab_rebalance_signal == 0) {
858             /* always hold this lock while we're running */
859             pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
860         }
861     }
862     return NULL;
863 }
864 
865 /* Iterate at most once through the slab classes and pick a "random" source.
866  * I like this better than calling rand() since rand() is slow enough that we
867  * can just check all of the classes once instead.
868  */
869 static int slabs_reassign_pick_any(int dst) {
870     static int cur = POWER_SMALLEST - 1;
871     int tries = power_largest - POWER_SMALLEST + 1;
872     for (; tries > 0; tries--) {
873         cur++;
874         if (cur > power_largest)
875             cur = POWER_SMALLEST;
876         if (cur == dst)
877             continue;
878         if (slabclass[cur].slabs > 1) {
879             return cur;
880         }
881     }
882     return -1;
883 }
884 
885 static enum reassign_result_type do_slabs_reassign(int src, int dst) {
886     if (slab_rebalance_signal != 0)
887         return REASSIGN_RUNNING;
888 
889     if (src == dst)
890         return REASSIGN_SRC_DST_SAME;
891 
892     /* Special indicator to choose ourselves. */
893     if (src == -1) {
894         src = slabs_reassign_pick_any(dst);
895         /* TODO: If we end up back at -1, return a new error type */
896     }
897 
898     if (src < POWER_SMALLEST || src > power_largest ||
899         dst < POWER_SMALLEST || dst > power_largest)
900         return REASSIGN_BADCLASS;
901 
902     if (slabclass[src].slabs < 2)
903         return REASSIGN_NOSPARE;
904 
905     slab_rebal.s_clsid = src;
906     slab_rebal.d_clsid = dst;
907 
908     slab_rebalance_signal = 1;
909     pthread_cond_signal(&slab_rebalance_cond);
910 
911     return REASSIGN_OK;
912 }
913 
914 enum reassign_result_type slabs_reassign(int src, int dst) {
915     enum reassign_result_type ret;
916     if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
917         return REASSIGN_RUNNING;
918     }
919     ret = do_slabs_reassign(src, dst);
920     pthread_mutex_unlock(&slabs_rebalance_lock);
921     return ret;
922 }
923 
924 /* If we hold this lock, rebalancer can't wake up or move */
925 void slabs_rebalancer_pause(void) {
926     pthread_mutex_lock(&slabs_rebalance_lock);
927 }
928 
929 void slabs_rebalancer_resume(void) {
930     pthread_mutex_unlock(&slabs_rebalance_lock);
931 }
932 
933 static pthread_t maintenance_tid;
934 static pthread_t rebalance_tid;
935 
936 int start_slab_maintenance_thread(void) {
937     int ret;
938     slab_rebalance_signal = 0;
939     slab_rebal.slab_start = NULL;
940     char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
941     if (env != NULL) {
942         slab_bulk_check = atoi(env);
943         if (slab_bulk_check == 0) {
944             slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
945         }
946     }
947 
948     if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
949         fprintf(stderr, "Can't intiialize rebalance condition\n");
950         return -1;
951     }
952     pthread_mutex_init(&slabs_rebalance_lock, NULL);
953 
954     if ((ret = pthread_create(&maintenance_tid, NULL,
955                               slab_maintenance_thread, NULL)) != 0) {
956         fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
957         return -1;
958     }
959     if ((ret = pthread_create(&rebalance_tid, NULL,
960                               slab_rebalance_thread, NULL)) != 0) {
961         fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
962         return -1;
963     }
964     return 0;
965 }
966 
967 /* The maintenance thread is on a sleep/loop cycle, so it should join after a
968  * short wait */
969 void stop_slab_maintenance_thread(void) {
970     mutex_lock(&slabs_rebalance_lock);
971     do_run_slab_thread = 0;
972     do_run_slab_rebalance_thread = 0;
973     pthread_cond_signal(&slab_rebalance_cond);
974     pthread_mutex_unlock(&slabs_rebalance_lock);
975 
976     /* Wait for the maintenance thread to stop */
977     pthread_join(maintenance_tid, NULL);
978     pthread_join(rebalance_tid, NULL);
979 }
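
To make the size-class arithmetic in slabs_init() concrete, the standalone sketch below reproduces the chunk-size progression under the defaults mentioned in the comments above: a 48-byte chunk_size on top of the item header, growth factor 1.25, 8-byte alignment (CHUNK_ALIGN_BYTES) and a 1MB maximum item size. The 48-byte figure used for sizeof(item) is only a stand-in (the real value depends on the build) and the MAX_NUMBER_OF_SLAB_CLASSES cap is left out, so the exact numbers are illustrative rather than authoritative.

#include <stdio.h>

int main(void) {
    const unsigned int item_header   = 48;            /* stand-in for sizeof(item) */
    const unsigned int chunk_size    = 48;            /* settings.chunk_size default (-n) */
    const unsigned int chunk_align   = 8;             /* CHUNK_ALIGN_BYTES */
    const unsigned int item_size_max = 1024 * 1024;   /* settings.item_size_max default, 1MB */
    const double factor = 1.25;                       /* growth factor default (-f) */

    unsigned int size = item_header + chunk_size;     /* sizeof(item) + settings.chunk_size */
    int cls = 1;                                       /* class 0 is unused, as in slabclass[] */

    while (size <= item_size_max / factor) {
        if (size % chunk_align)                        /* round up to the alignment boundary */
            size += chunk_align - (size % chunk_align);
        printf("slab class %3d: chunk size %8u, chunks per 1MB page %6u\n",
               cls, size, item_size_max / size);
        size = (unsigned int)(size * factor);          /* next class grows by the factor */
        cls++;
    }
    /* the last class always holds one item_size_max chunk per page */
    printf("slab class %3d: chunk size %8u, chunks per 1MB page %6u\n",
           cls, item_size_max, 1u);
    return 0;
}
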
Original article: https://www.cnblogs.com/wangxiaokun/p/4778757.html