Threads on Linux (NPTL) and Memory Leaks

Threads are great: write a few lines and your program is suddenly parallel and multi-core. When writing a server with low or modest concurrency, the one-thread-per-connection model is often the most convenient way to handle it. But be careful: if you call pthread_create without setting the detached attribute, the thread never detaches itself, and no other thread ever joins it, then the thread's stack is never freed after it exits (just like a process that terminates with nobody to wait on it). That is a serious memory leak, and a server cannot afford one. A minimal sketch of the problem and the fix follows.
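Here is a minimal sketch of both the leak and the fix (the worker function and the loop bound are made up for illustration; compile with -pthread). With a default, joinable attribute and no pthread_join, every exited thread would leave its stack mapped; setting PTHREAD_CREATE_DETACHED lets NPTL free the stack at thread exit, as the source walk below shows.

#include <pthread.h>
#include <unistd.h>

static void *worker (void *arg)
{
  /* ... handle one connection ... */
  return NULL;
}

int main (void)
{
  pthread_attr_t attr;
  pthread_attr_init (&attr);
  /* The fix: create the thread detached, so NPTL frees its TCB and
     stack as soon as it exits (see __free_tcb below).  */
  pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);

  for (int i = 0; i < 10000; ++i)
    {
      pthread_t tid;
      /* With a default (joinable) attribute and no pthread_join, each
         of these threads would leak its stack after returning.  */
      if (pthread_create (&tid, &attr, worker, NULL) != 0)
        break;
    }

  pthread_attr_destroy (&attr);
  pause ();  /* keep the process alive so you can inspect its memory */
  return 0;
}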

This is not scare-mongering; let's go digging for the truth in the source of the Linux thread implementation. On Linux, threads are implemented in glibc, and the default these days is NPTL (Native POSIX Threads Library), so let's skim the NPTL code and reconstruct how the leak happens.

First, thread creation (in glibc/nptl/sysdeps/pthread/createthread.c and glibc/nptl/pthread_create.c), shown abridged.

First, in pthread_create.c:

int
__pthread_create_2_1 (newthread, attr, start_routine, arg)
     pthread_t *newthread;
     const pthread_attr_t *attr;
     void *(*start_routine) (void *);
     void *arg;
{
  STACK_VARIABLES;
 
  const struct pthread_attr *iattr = (struct pthread_attr *) attr;
  if (iattr == NULL)
    /* Is this the best idea?  On NUMA machines this could mean
       accessing far-away memory.  */
    iattr = &default_attr;
 
  struct pthread *pd = NULL;
  int err = ALLOCATE_STACK (iattr, &pd);
  if (__builtin_expect (err != 0, 0))
    /* Something went wrong.  Maybe a parameter of the attributes is
       invalid or we could not allocate memory.  */
    return err;
 
  /* Initialize the TCB.  All initializations with zero should be
     performed in 'get_cached_stack'.  This way we avoid doing this if
     the stack freshly allocated with 'mmap'.  */
 
#ifdef TLS_TCB_AT_TP
  /* Reference to the TCB itself.  */
  pd->header.self = pd;
 
  /* Self-reference for TLS.  */
  pd->header.tcb = pd;
#endif
 
  /* Store the address of the start routine and the parameter.  Since
     we do not start the function directly the stillborn thread will
     get the information from its thread descriptor.  */
  pd->start_routine = start_routine;
  pd->arg = arg;
 
  /* Copy the thread attribute flags.  */
  struct pthread *self = THREAD_SELF;
  pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
           | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)));
 
  /* Initialize the field for the ID of the thread which is waiting
     for us.  This is a self-reference in case the thread is created
     detached.  */
  pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL;
 
  /* The debug events are inherited from the parent.  */
  pd->eventbuf = self->eventbuf;
 
  /* Copy the parent's scheduling parameters.  The flags will say what
     is valid and what is not.  */
  pd->schedpolicy = self->schedpolicy;
  pd->schedparam = self->schedparam;
 
  /* Pass the descriptor to the caller.  */
  *newthread = (pthread_t) pd;
 
  /* Start the thread.  */
  return create_thread (pd, iattr, STACK_VARIABLES_ARGS);
}

ALLOCATE_STACK allocates the stack, and then the TCB (Thread Control Block) is initialized. Finally, create_thread is called to continue thread creation.
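Since ALLOCATE_STACK sizes the stack from the attribute object, the attribute also determines how much memory each leaked thread costs (by default it follows ulimit -s, commonly 8 MiB). A small sketch of bounding that cost; make_thread is a hypothetical wrapper, not part of any API:

#include <pthread.h>

int make_thread (pthread_t *tid, void *(*fn) (void *), void *arg)
{
  pthread_attr_t attr;
  int err = pthread_attr_init (&attr);
  if (err != 0)
    return err;
  /* Ask ALLOCATE_STACK for a 256 KiB stack instead of the default.  */
  err = pthread_attr_setstacksize (&attr, 256 * 1024);
  if (err == 0)
    err = pthread_create (tid, &attr, fn, arg);
  pthread_attr_destroy (&attr);
  return err;
}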

Next, in createthread.c:

static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
           STACK_VARIABLES_PARMS)
{
  int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGNAL
             | CLONE_SETTLS | CLONE_PARENT_SETTID
             | CLONE_CHILD_CLEARTID | CLONE_SYSVSEM
#if __ASSUME_NO_CLONE_DETACHED == 0
             | CLONE_DETACHED
#endif
             | 0);
 
  if (__builtin_expect (THREAD_GETMEM (THREAD_SELF, report_events), 0))
    {
      const int _idx = __td_eventword (TD_CREATE);
      const uint32_t _mask = __td_eventmask (TD_CREATE);
 
      if ((_mask & (__nptl_threads_events.event_bits[_idx]
            | pd->eventbuf.eventmask.event_bits[_idx])) != 0)
    {
      pd->stopped_start = true;
      int res = do_clone (pd, attr, clone_flags, start_thread,
                  STACK_VARIABLES_ARGS, 1);
      if (res == 0)
        {
          do
        pd->nextevent = __nptl_last_event;
          while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
                               pd, pd->nextevent)
             != 0);
          lll_unlock (pd->lock, LLL_PRIVATE);
        }
 
      return res;
    }
    }
 
  bool stopped = false;
  if (attr != NULL && (attr->cpuset != NULL
               || (attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0))
    stopped = true;
  pd->stopped_start = stopped;
  pd->parent_cancelhandling = THREAD_GETMEM (THREAD_SELF, cancelhandling);
 
  /* Actually create the thread.  */
  int res = do_clone (pd, attr, clone_flags, start_thread,
              STACK_VARIABLES_ARGS, stopped);
 
  return res;
}

There it is: basically just setting the clone flags (the most important being the ones that share the address space and the file descriptors) and the entry function, start_thread. From this you can also see that on Linux, threads are really just processes that share memory.
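To make that concrete, here is a toy demonstration of the same idea (not how NPTL actually invokes clone): hand-rolling a "thread" with the core sharing flags and watching its write land in the parent's memory. Linux-only; compile with gcc and _GNU_SOURCE.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int shared = 0;

static int child_fn (void *arg)
{
  shared = 42;   /* CLONE_VM: this write is visible to the parent */
  return 0;
}

int main (void)
{
  size_t stack_size = 64 * 1024;
  char *stack = malloc (stack_size);
  if (stack == NULL)
    return 1;

  /* The stack grows down on x86, so pass the top of the allocation.  */
  pid_t pid = clone (child_fn, stack + stack_size,
                     CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, NULL);
  if (pid == -1)
    return 1;

  waitpid (pid, NULL, 0);
  printf ("shared = %d\n", shared);   /* prints 42 */
  free (stack);
  return 0;
}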

Now back to pthread_create.c to see what start_thread does:

static int
start_thread (void *arg)
{
  struct pthread *pd = (struct pthread *) arg;
 
  /* Initialize resolver state pointer.  */
  __resp = &pd->res;
 
  /* This is where the try/finally block should be created.  For
     compilers without that support we do use setjmp.  */
  struct pthread_unwind_buf unwind_buf;
 
  /* No previous handlers.  */
  unwind_buf.priv.data.prev = NULL;
  unwind_buf.priv.data.cleanup = NULL;
 
  int not_first_call;
  not_first_call = setjmp ((struct __jmp_buf_tag *) unwind_buf.cancel_jmp_buf);
  if (__builtin_expect (! not_first_call, 1))
    {
      /* Store the new cleanup handler info.  */
      THREAD_SETMEM (pd, cleanup_jmp_buf, &unwind_buf);
 
      if (__builtin_expect (pd->stopped_start, 0))
    {
      int oldtype = CANCEL_ASYNC ();
 
      /* Get the lock the parent locked to force synchronization.  */
      lll_lock (pd->lock, LLL_PRIVATE);
      /* And give it up right away.  */
      lll_unlock (pd->lock, LLL_PRIVATE);
 
      CANCEL_RESET (oldtype);
    }
 
      /* Run the code the user provided.  */
#ifdef CALL_THREAD_FCT
      THREAD_SETMEM (pd, result, CALL_THREAD_FCT (pd));
#else
      THREAD_SETMEM (pd, result, pd->start_routine (pd->arg));
#endif
    }
 
  /* Run the destructor for the thread-local data.  */
  __nptl_deallocate_tsd ();
 
  /* Clean up any state libc stored in thread-local variables.  */
  __libc_thread_freeres ();
 
  /* If this is the last thread we terminate the process now.  We
     do not notify the debugger, it might just irritate it if there
     is no thread left.  */
  if (__builtin_expect (atomic_decrement_and_test (&__nptl_nthreads), 0))
    /* This was the last thread.  */
    exit (0);
 
  /* The thread is exiting now.  Don't set this bit until after we've hit
     the event-reporting breakpoint, so that td_thr_get_info on us while at
     the breakpoint reports TD_THR_RUN state rather than TD_THR_ZOMBIE.  */
  atomic_bit_set (&pd->cancelhandling, EXITING_BIT);
 
  /* If the thread is detached free the TCB.  */
  if (IS_DETACHED (pd))
    /* Free the TCB.  */
    __free_tcb (pd);
  else if (__builtin_expect (pd->cancelhandling & SETXID_BITMASK, 0))
    {
      /* Some other thread might call any of the setXid functions and expect
     us to reply.  In this case wait until we did that.  */
      do
    lll_futex_wait (&pd->setxid_futex, 0, LLL_PRIVATE);
      while (pd->cancelhandling & SETXID_BITMASK);
 
      /* Reset the value so that the stack can be reused.  */
      pd->setxid_futex = 0;
    }
 
  /* We cannot call '_exit' here.  '_exit' will terminate the process.
 
     The 'exit' implementation in the kernel will signal when the
     process is really dead since 'clone' got passed the CLONE_CLEARTID
     flag.  The 'tid' field in the TCB will be set to zero.
 
     The exit code is zero since in case all threads exit by calling
     'pthread_exit' the exit status must be 0 (zero).  */
  __exit_thread_inline (0);
 
  /* NOTREACHED */
  return 0;
}

The main logic here is to run the user-supplied thread function (the CALL_THREAD_FCT / pd->start_routine call) and then deal with its termination. And after it finishes? In the IS_DETACHED (pd) check near the end, if the thread is detached, the TCB is freed, and inside __free_tcb the thread's stack is released. If the thread is not detached, the stack is kept.
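This suggests the other fix when you cannot set the attribute up front: the thread can detach itself at the top of its own function, so that by the time start_thread reaches the IS_DETACHED check, __free_tcb gets called. A small sketch:

#include <pthread.h>

static void *connection_handler (void *arg)
{
  /* Detach ourselves: when this function returns, start_thread will
     take the IS_DETACHED branch and free the TCB and stack.  Only do
     this if the creator will never pthread_join this thread.  */
  pthread_detach (pthread_self ());

  /* ... serve the connection ... */
  return NULL;   /* note: the return value is now unretrievable */
}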

Finally, let's look at pthread_join (in glibc/nptl/pthread_join.c):

int
pthread_join (threadid, thread_return)
     pthread_t threadid;
     void **thread_return;
{
  struct pthread *pd = (struct pthread *) threadid;
 
  /* Make sure the descriptor is valid.  */
  if (INVALID_NOT_TERMINATED_TD_P (pd))
    /* Not a valid thread handle.  */
    return ESRCH;
 
  /* Is the thread joinable?.  */
  if (IS_DETACHED (pd))
    /* We cannot wait for the thread.  */
    return EINVAL;
 
  struct pthread *self = THREAD_SELF;
  int result = 0;
 
  /* During the wait we change to asynchronous cancellation.  If we
     are canceled the thread we are waiting for must be marked as
     un-wait-ed for again.  */
  pthread_cleanup_push (cleanup, &pd->joinid);
 
  /* Switch to asynchronous cancellation.  */
  int oldtype = CANCEL_ASYNC ();
 
  if ((pd == self
       || (self->joinid == pd
       && (pd->cancelhandling
           & (CANCELING_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
          | TERMINATED_BITMASK)) == 0))
      && !CANCEL_ENABLED_AND_CANCELED (self->cancelhandling))
    /* This is a deadlock situation.  The threads are waiting for each
       other to finish.  Note that this is a "may" error.  To be 100%
       sure we catch this error we would have to lock the data
       structures but it is not necessary.  In the unlikely case that
       two threads are really caught in this situation they will
       deadlock.  It is the programmer's problem to figure this
       out.  */
    result = EDEADLK;
  /* Wait for the thread to finish.  If it is already locked something
     is wrong.  There can only be one waiter.  */
  else if (__builtin_expect (atomic_compare_and_exchange_bool_acq (&pd->joinid,
                                   self,
                                   NULL), 0))
    /* There is already somebody waiting for the thread.  */
    result = EINVAL;
  else
    /* Wait for the child.  */
    lll_wait_tid (pd->tid);
 
  /* Restore cancellation mode.  */
  CANCEL_RESET (oldtype);
 
  /* Remove the handler.  */
  pthread_cleanup_pop (0);
 
  if (__builtin_expect (result == 0, 1))
    {
      /* We mark the thread as terminated and as joined.  */
      pd->tid = -1;
 
      /* Store the return value if the caller is interested.  */
      if (thread_return != NULL)
    *thread_return = pd->result;
 
      /* Free the TCB.  */
      __free_tcb (pd);
    }
 
  return result;
}

It's basically memory-release housekeeping, and once again we meet __free_tcb.

Don't blame glibc; there is a reason for this design. The thread's return value lives in the TCB, so someone has to take responsibility for it. You see where this is going: if nobody claims it, the thread ends up like a zombie process. Except that a zombie process at least has init as an ancestor of last resort to reap it, while a thread has no such reaper; its memory can only be reclaimed when the whole process exits. And that is your leak.
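For completeness, a sketch of the joinable path done right: join every thread you create, and the return value stashed in the TCB comes back through thread_return before __free_tcb releases everything.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static void *worker (void *arg)
{
  int *result = malloc (sizeof *result);
  if (result != NULL)
    *result = 123;
  return result;   /* held in the TCB's result field until joined */
}

int main (void)
{
  pthread_t tid;
  if (pthread_create (&tid, NULL, worker, NULL) != 0)
    return 1;

  void *ret;
  /* pthread_join copies pd->result out and then calls __free_tcb,
     releasing the TCB and stack -- no leak on this path.  */
  if (pthread_join (tid, &ret) == 0 && ret != NULL)
    {
      printf ("worker returned %d\n", *(int *) ret);
      free (ret);
    }
  return 0;
}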

Original article: https://www.cnblogs.com/ylqmf/p/2754793.html