Dynamic memory in Go

The design of Go's dynamic memory allocation and reclamation comes from tcmalloc.
Main data structures (a simplified sketch follows the list):
MHeap: the malloc heap, which manages memory at page granularity
MCentral: a free list for small objects of a given size class, shared by all MCaches
MCache: per-thread (per-P) free lists of small objects, accessed without locks
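The three layers nest roughly as in the sketch below. This is an illustrative Go sketch only: the type names, field layout, and size-class count are simplified assumptions, not the runtime's actual definitions.

package sketch

// Illustrative only; the runtime's real count is the constant _NumSizeClasses.
const numSizeClasses = 67

type mspan struct {
	freelist *byte // linked list of free objects inside this span
	ref      int   // number of objects handed out from this span
}

// MCache: owned by a single P, so the allocation fast path touches it without locks.
type mcache struct {
	alloc [numSizeClasses]*mspan // one cached span per size class
}

// MCentral: one per size class, shared by all Ps, protected by a lock.
type mcentral struct {
	nonempty []*mspan // spans that still have free objects
	empty    []*mspan // spans with no free objects left
}

// MHeap: owns the page-granular arena and one MCentral per size class.
type mheap struct {
	central [numSizeClasses]mcentral
	// plus free page lists, arena bounds, sweep generation, ...
}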

Allocating small objects

  1. Look up the MCache free list for the matching size class; if it is non-empty,
    take an object from it directly. This path incurs no locking overhead at all.
  2. If the MCache free list is empty, fetch some free objects from the MCentral.
  3. If the MCentral free list is empty, request some pages from the MHeap and add
    their memory to the free list of the corresponding MCentral.
  4. If the MHeap does not have enough cached pages, request pages from the operating
    system (at least 1 MB at a time). (A toy sketch of this refill chain follows the list.)
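Below is a toy, self-contained model of the refill chain in the list above (MCache -> MCentral -> MHeap -> OS). Everything here is illustrative: the real runtime hands out spans of raw memory, takes a lock around MCentral, and grows the heap in chunks of at least 1 MB.

package main

import "fmt"

type span struct{ free []int } // indices of free object slots in one span

type mcache struct{ spans map[int]*span }     // per-P; accessed without locks
type mcentral struct{ spans map[int][]*span } // shared; would be lock-protected
type mheap struct{ pages int }                // pages obtained from the OS

// grow models step 4: MHeap asks the OS for more pages and carves out a span.
func (h *mheap) grow() *span {
	h.pages += 256
	return &span{free: []int{0, 1, 2, 3}}
}

// cacheSpan models step 3: hand a span with free objects to an MCache,
// growing the heap when the central list for this size class is empty.
func (c *mcentral) cacheSpan(h *mheap, class int) *span {
	if list := c.spans[class]; len(list) > 0 {
		s := list[len(list)-1]
		c.spans[class] = list[:len(list)-1]
		return s
	}
	return h.grow()
}

// alloc models steps 1 and 2: pop from the local free list, refilling from
// MCentral only when the cached span is exhausted.
func alloc(mc *mcache, ce *mcentral, h *mheap, class int) int {
	s := mc.spans[class]
	if s == nil || len(s.free) == 0 {
		s = ce.cacheSpan(h, class)
		mc.spans[class] = s
	}
	obj := s.free[len(s.free)-1]
	s.free = s.free[:len(s.free)-1]
	return obj
}

func main() {
	h := &mheap{}
	ce := &mcentral{spans: map[int][]*span{}}
	mc := &mcache{spans: map[int]*span{}}
	for i := 0; i < 6; i++ {
		fmt.Println("object slot", alloc(mc, ce, h, 2), "pages from OS:", h.pages)
	}
}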

Allocating large objects

Large objects (> 32 kB) are allocated directly from MHeap.
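As a quick illustration (sizes only; assuming both slices escape to the heap rather than being stack-allocated by escape analysis), anything over the 32 kB small-object limit mentioned in the mallocgc comment below skips MCache and MCentral entirely:

package main

func main() {
	small := make([]byte, 512)    // <= 32 kB: served from the per-P MCache free lists
	large := make([]byte, 64<<10) // > 32 kB: pages come straight from MHeap
	_, _ = small, large
}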

Allocating dynamic memory

The allocation entry point in the runtime is mallocgc:

// Allocate an object of size bytes.
// Small objects are allocated from the per-P cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
    if size == 0 {
    	return unsafe.Pointer(&zerobase)
    }
    size0 := size
 
    if flags&flagNoScan == 0 && typ == nil {
    	gothrow("malloc missing type")
    }
 
    // This function must be atomic wrt GC, but for performance reasons
    // we don't acquirem/releasem on fast path. The code below does not have
    // split stack checks, so it can't be preempted by GC.
    // Functions like roundup/add are inlined. And onM/racemalloc are nosplit.
    // If debugMalloc = true, these assumptions are checked below.
    if debugMalloc {
    	mp := acquirem()
    	if mp.mallocing != 0 {
    		gothrow("malloc deadlock")
    	}
    	mp.mallocing = 1
    	if mp.curg != nil {
    		mp.curg.stackguard0 = ^uintptr(0xfff) | 0xbad
    	}
    }
 
    c := gomcache()
    var s *mspan
    var x unsafe.Pointer
    if size <= maxSmallSize {
    	if flags&flagNoScan != 0 && size < maxTinySize {
    		// Tiny allocator.
    		//
    		// Tiny allocator combines several tiny allocation requests
    		// into a single memory block. The resulting memory block
    		// is freed when all subobjects are unreachable. The subobjects
    		// must be FlagNoScan (don't have pointers), this ensures that
    		// the amount of potentially wasted memory is bounded.
    		//
    		// Size of the memory block used for combining (maxTinySize) is tunable.
    		// Current setting is 16 bytes, which relates to 2x worst case memory
    		// wastage (when all but one subobjects are unreachable).
    		// 8 bytes would result in no wastage at all, but provides less
    		// opportunities for combining.
    		// 32 bytes provides more opportunities for combining,
    		// but can lead to 4x worst case wastage.
    		// The best case winning is 8x regardless of block size.
    		//
    		// Objects obtained from tiny allocator must not be freed explicitly.
    		// So when an object will be freed explicitly, we ensure that
    		// its size >= maxTinySize.
    		//
    		// SetFinalizer has a special case for objects potentially coming
    		// from tiny allocator, in such a case it allows to set finalizers
    		// for an inner byte of a memory block.
    		//
    		// The main targets of tiny allocator are small strings and
    		// standalone escaping variables. On a json benchmark
    		// the allocator reduces number of allocations by ~12% and
    		// reduces heap size by ~20%.
    		tinysize := uintptr(c.tinysize)
    		if size <= tinysize {
    			tiny := unsafe.Pointer(c.tiny)
    			// Align tiny pointer for required (conservative) alignment.
    			if size&7 == 0 {
    				tiny = roundup(tiny, 8)
    			} else if size&3 == 0 {
    				tiny = roundup(tiny, 4)
    			} else if size&1 == 0 {
    				tiny = roundup(tiny, 2)
    			}
    			size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny)))
    			if size1 <= tinysize {
    				// The object fits into existing tiny block.
    				x = tiny
    				c.tiny = (*byte)(add(x, size))
    				c.tinysize -= uintptr(size1)
    				c.local_tinyallocs++
    				if debugMalloc {
    					mp := acquirem()
    					if mp.mallocing == 0 {
    						gothrow("bad malloc")
    					}
    					mp.mallocing = 0
    					if mp.curg != nil {
    						mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard
    					}
    					// Note: one releasem for the acquirem just above.
    					// The other for the acquirem at start of malloc.
    					releasem(mp)
    					releasem(mp)
    				}
    				return x
    			}
    		}
    		// Allocate a new maxTinySize block.
    		s = c.alloc[tinySizeClass]
    		v := s.freelist
    		if v == nil {
    			mp := acquirem()
    			mp.scalararg[0] = tinySizeClass
    			onM(mcacheRefill_m)
    			releasem(mp)
    			s = c.alloc[tinySizeClass]
    			v = s.freelist
    		}
    		s.freelist = v.next
    		s.ref++
    		//TODO: prefetch v.next
    		x = unsafe.Pointer(v)
    		(*[2]uint64)(x)[0] = 0
    		(*[2]uint64)(x)[1] = 0
    		// See if we need to replace the existing tiny block with the new one
    		// based on amount of remaining free space.
    		if maxTinySize-size > tinysize {
    			c.tiny = (*byte)(add(x, size))
    			c.tinysize = uintptr(maxTinySize - size)
    		}
    		size = maxTinySize
    	} else {
    		var sizeclass int8
    		if size <= 1024-8 {
    			sizeclass = size_to_class8[(size+7)>>3]
    		} else {
    			sizeclass = size_to_class128[(size-1024+127)>>7]
    		}
    		size = uintptr(class_to_size[sizeclass])
    		s = c.alloc[sizeclass]
    		v := s.freelist
    		if v == nil {
    			mp := acquirem()
    			mp.scalararg[0] = uintptr(sizeclass)
    			onM(mcacheRefill_m)
    			releasem(mp)
    			s = c.alloc[sizeclass]
    			v = s.freelist
    		}
    		s.freelist = v.next
    		s.ref++
    		//TODO: prefetch
    		x = unsafe.Pointer(v)
    		if flags&flagNoZero == 0 {
    			v.next = nil
    			if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 {
    				memclr(unsafe.Pointer(v), size)
    			}
    		}
    	}
    	c.local_cachealloc += intptr(size)
    } else {
    	mp := acquirem()
    	mp.scalararg[0] = uintptr(size)
    	mp.scalararg[1] = uintptr(flags)
    	onM(largeAlloc_m)
    	s = (*mspan)(mp.ptrarg[0])
    	mp.ptrarg[0] = nil
    	releasem(mp)
    	x = unsafe.Pointer(uintptr(s.start << pageShift))
    	size = uintptr(s.elemsize)
    }
 
    if flags&flagNoScan != 0 {
    	// All objects are pre-marked as noscan.
    	goto marked
    }
 
    // If allocating a defer+arg block, now that we've picked a malloc size
    // large enough to hold everything, cut the "asked for" size down to
    // just the defer header, so that the GC bitmap will record the arg block
    // as containing nothing at all (as if it were unused space at the end of
    // a malloc block caused by size rounding).
    // The defer arg areas are scanned as part of scanstack.
    if typ == deferType {
    	size0 = unsafe.Sizeof(_defer{})
    }
 
    // From here till marked label marking the object as allocated
    // and storing type info in the GC bitmap.
    {
    	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
    	off := (uintptr(x) - arena_start) / ptrSize
    	xbits := (*uint8)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
    	shift := (off % wordsPerBitmapByte) * gcBits
    	if debugMalloc && ((*xbits>>shift)&(bitMask|bitPtrMask)) != bitBoundary {
    		println("runtime: bits =", (*xbits>>shift)&(bitMask|bitPtrMask))
    		gothrow("bad bits in markallocated")
    	}
 
    	var ti, te uintptr
    	var ptrmask *uint8
    	if size == ptrSize {
    		// It's one word and it has pointers, it must be a pointer.
    		*xbits |= (bitsPointer << 2) << shift
    		goto marked
    	}
    	if typ.kind&kindGCProg != 0 {
    		nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
    		masksize := nptr
    		if masksize%2 != 0 {
    			masksize *= 2 // repeated
    		}
    		masksize = masksize * pointersPerByte / 8 // 4 bits per word
    		masksize++                                // unroll flag in the beginning
    		if masksize > maxGCMask && typ.gc[1] != 0 {
    			// If the mask is too large, unroll the program directly
    			// into the GC bitmap. It's 7 times slower than copying
    			// from the pre-unrolled mask, but saves 1/16 of type size
    			// memory for the mask.
    			mp := acquirem()
    			mp.ptrarg[0] = x
    			mp.ptrarg[1] = unsafe.Pointer(typ)
    			mp.scalararg[0] = uintptr(size)
    			mp.scalararg[1] = uintptr(size0)
    			onM(unrollgcproginplace_m)
    			releasem(mp)
    			goto marked
    		}
    		ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
    		// Check whether the program is already unrolled.
    		if uintptr(atomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
    			mp := acquirem()
    			mp.ptrarg[0] = unsafe.Pointer(typ)
    			onM(unrollgcprog_m)
    			releasem(mp)
    		}
    		ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
    	} else {
    		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
    	}
    	if size == 2*ptrSize {
    		*xbits = *ptrmask | bitBoundary
    		goto marked
    	}
    	te = uintptr(typ.size) / ptrSize
    	// If the type occupies odd number of words, its mask is repeated.
    	if te%2 == 0 {
    		te /= 2
    	}
    	// Copy pointer bitmask into the bitmap.
    	for i := uintptr(0); i < size0; i += 2 * ptrSize {
    		v := *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
    		ti++
    		if ti == te {
    			ti = 0
    		}
    		if i == 0 {
    			v |= bitBoundary
    		}
    		if i+ptrSize == size0 {
    			v &^= uint8(bitPtrMask << 4)
    		}
 
    		*xbits = v
    		xbits = (*byte)(add(unsafe.Pointer(xbits), ^uintptr(0)))
    	}
    	if size0%(2*ptrSize) == 0 && size0 < size {
    		// Mark the word after last object's word as bitsDead.
    		*xbits = bitsDead << 2
    	}
    }
marked:
    if raceenabled {
    	racemalloc(x, size)
    }
 
    if debugMalloc {
    	mp := acquirem()
    	if mp.mallocing == 0 {
    		gothrow("bad malloc")
    	}
    	mp.mallocing = 0
    	if mp.curg != nil {
    		mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard
    	}
    	// Note: one releasem for the acquirem just above.
    	// The other for the acquirem at start of malloc.
    	releasem(mp)
    	releasem(mp)
    }
 
    if debug.allocfreetrace != 0 {
    	tracealloc(x, size, typ)
    }
 
    if rate := MemProfileRate; rate > 0 {
    	if size < uintptr(rate) && int32(size) < c.next_sample {
    		c.next_sample -= int32(size)
    	} else {
    		mp := acquirem()
    		profilealloc(mp, x, size)
    		releasem(mp)
    	}
    }
 
    if memstats.heap_alloc >= memstats.next_gc {
    	gogc(0)
    }
 
    return x
}
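A hedged way to watch the allocator from user code is to diff runtime.MemStats around a burst of allocations. The exact counts depend on escape analysis and the runtime version, so treat this as a sketch rather than a direct measurement of mallocgc:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)

	sink := make([][]byte, 0, 1000)
	for i := 0; i < 1000; i++ {
		// Small, pointer-free objects: the MCache fast path in mallocgc.
		sink = append(sink, make([]byte, 64))
	}
	big := make([]byte, 128<<10) // > 32 kB: the largeAlloc_m path

	runtime.ReadMemStats(&after)
	fmt.Println("mallocs:", after.Mallocs-before.Mallocs)
	fmt.Println("bytes allocated:", after.TotalAlloc-before.TotalAlloc)
	_, _ = sink, big
}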

Freeing dynamic memory

Go has no equivalent of C's free function; dynamic memory is reclaimed by the GC. Objects are not freed one at a time: each free operation returns n objects belonging to one span.
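From user code the only way to "free" is to drop references and let the GC sweep. A hedged sketch of observing that (runtime.MemStats.Frees counts objects reclaimed by sweep, and sweeping can lag behind the GC cycle, so the number is only indicative):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	sink := make([][]byte, 1000)
	for i := range sink {
		sink[i] = make([]byte, 128)
	}
	sink = nil   // drop the only references
	runtime.GC() // force a collection; sweep returns the objects span by span

	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	fmt.Println("objects freed so far:", m.Frees)
}

Inside the runtime, the routine that returns a batch of swept objects to the central free list is MCentral_FreeSpan (still written in C in this version of the runtime):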

// Free n objects from a span s back into the central free list c.
// Called during sweep.
// Returns true if the span was returned to heap.  Sets sweepgen to
// the latest generation.
// If preserve=true, don't return the span to heap nor relink in MCentral lists;
// caller takes care of it.
bool
runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end, bool preserve)
{
    bool wasempty;
 
    if(s->incache)
    	runtime·throw("freespan into cached span");
 
    // Add the objects back to s's free list.
    wasempty = s->freelist == nil;
    end->next = s->freelist;
    s->freelist = start;
    s->ref -= n;
 
    if(preserve) {
    	// preserve is set only when called from MCentral_CacheSpan above,
    	// the span must be in the empty list.
    	if(s->next == nil)
    		runtime·throw("can't preserve unlinked span");
    	runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
    	return false;
    }
 
    runtime·lock(&c->lock);
 
    // Move to nonempty if necessary.
    if(wasempty) {
    	runtime·MSpanList_Remove(s);
    	runtime·MSpanList_Insert(&c->nonempty, s);
    }
 
    // delay updating sweepgen until here.  This is the signal that
    // the span may be used in an MCache, so it must come after the
    // linked list operations above (actually, just after the
    // lock of c above.)
    runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
 
    if(s->ref != 0) {
    	runtime·unlock(&c->lock);
    	return false;
    }
 
    // s is completely freed, return it to the heap.
    runtime·MSpanList_Remove(s);
    s->needzero = 1;
    s->freelist = nil;
    runtime·unlock(&c->lock);
    runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
    runtime·MHeap_Free(&runtime·mheap, s, 0);
    return true;
}
Original article: https://www.cnblogs.com/richmonkey/p/4509656.html