Golang---基本类型(map)

摘要：今天我们来学习 Golang 中的另外一种常用的数据类型，通过数据结构和源码来分析 golang 中的 map 是如何实现的。

数据结构

bucketCntBits = 3
bucketCnt = 1 << bucketCntBits

// Maximum average load of a bucket that triggers growth is 6.5.
// Represent as loadFactorNum/loadFactorDen, to allow integer math.
loadFactorNum = 13
loadFactorDen = 2

type hmap struct {
    count      int      //哈希表中元素数量
    flags      uint8  //标志位，每一位有特定的含义，比如写标识
    B          uint8  //bucket 的数量，len(buckets) = pow(2, B)
    noverflow  uint16
    hash0      uint32  //传入 Hash 函数的哈希种子
    buckets    *bmap   //指向 bmap 结构体数组的指针
    oldbuckets *bmap   //哈希扩容时候用到，和 Redis 哈希扩容策略相同
    nevacuate  uintptr
    extra      unsafe.Pointer // *mapextra
 }


type bmap struct {
    topbits [8]uint8 //存储哈希值的高 8 位
    keys[8] keytype  //存储 8 个 key
    values[8] valuetype  //存储 8 个 value
    pad      uintptr  //内存对齐
    overflow uintptr  //溢出桶指针，能减少扩容频率
}

// mapextra holds fields that are not present on all maps.
type mapextra struct {
    // If both key and elem do not contain pointers and are inline, then we mark bucket
    // type as containing no pointers. This avoids scanning such maps.
    // However, bmap.overflow is a pointer. In order to keep overflow buckets
    // alive, we store pointers to all overflow buckets in hmap.extra.overflow and hmap.extra.oldoverflow.
    // overflow and oldoverflow are only used if key and elem do not contain pointers.
    // overflow contains overflow buckets for hmap.buckets.
    // oldoverflow contains overflow buckets for hmap.oldbuckets.
    // The indirection allows to store a pointer to the slice in hiter.
    overflow* [] * bmap
    oldoverflow* [] * bmap

    // nextOverflow holds a pointer to a free overflow bucket.
    nextOverflow* bmap
}

map-struct

map 的创建

// makemap implements Go map creation for make(map[k]v, hint).
// If the compiler has determined that the map or the first bucket
// can be created on the stack, h and/or bucket may be non-nil.
// If h != nil, the map can be created directly in h.
// If h.buckets != nil, bucket pointed to can be used as the first bucket.
func makemap(t *maptype, hint int, h *hmap) *hmap {
    //一些检查
    mem, overflow := math.MulUintptr(uintptr(hint), t.bucket.size)
    if overflow || mem > maxAlloc {
        hint = 0
    }

    // initialize Hmap
    if h == nil {
        h = new(hmap)
    }
    //生成一个 hash种子
    h.hash0 = fastrand()

    // Find the size parameter B which will hold the requested # of elements.
    // For hint < 0 overLoadFactor returns false since hint < bucketCnt.
    //设置初始值 B = 0
    B := uint8(0)
    //如果创建 map 时候的时候指定了元素的个数，则会选择一个合适的 B 值
    for overLoadFactor(hint, B) {
        B++
    }
    h.B = B

    // allocate initial hash table
    // if B == 0, the buckets field is allocated lazily later (in mapassign)
    // If hint is large zeroing this memory could take a while.
    // 如果有扩容，则分配溢出桶，这是一种懒加载的思想，需要的时候才赋值
    if h.B != 0 {
        var nextOverflow *bmap
        h.buckets, nextOverflow = makeBucketArray(t, h.B, nil)
        if nextOverflow != nil {
            h.extra = new(mapextra)
            h.extra.nextOverflow = nextOverflow
        }
    }

    return h
}

map-create

map 的查找

我们有两种访问 hash 的方式：

//method1
val := hash[key]
//method2
val, ok := hash[key]

上面两种方式调用的底层函数是不同的。当接收参数仅为一个时，会调用 runtime.mapaccess1; 当接收函数为两个参数时，会使用 runtime.mapaccess2;这两个的代码实现基本类似，我们就拿 mapaccess1 来进行分析：

func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    if raceenabled && h != nil {
        callerpc := getcallerpc()
        pc := funcPC(mapaccess1)
        racereadpc(unsafe.Pointer(h), callerpc, pc)
        raceReadObjectPC(t.key, key, callerpc, pc)
    }
    if msanenabled && h != nil {
        msanread(key, t.key.size)
    }
    if h == nil || h.count == 0 {
        if t.hashMightPanic() {
            t.hasher(key, 0) // see issue 23734
        }
        return unsafe.Pointer(&zeroVal[0])
    }
    if h.flags&hashWriting != 0 {
        throw("concurrent map read and map write")
    }
    hash := t.hasher(key, uintptr(h.hash0))
    // 如果 B==5, m = 0001 1111, 后面用 m 来定位桶
    // 因为 len(buckets) = pow(2, B), 
    m := bucketMask(h.B)
    //hash&m 得到 key 的 hash值的最后 B位
    // b 通过基地址+偏移量的方式 定位到 桶起始地址
    b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
    if c := h.oldbuckets; c != nil {
        if !h.sameSizeGrow() {
            // There used to be half as many buckets; mask down one more power of two.
            m >>= 1
        }
        oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
        if !evacuated(oldb) {
            b = oldb
        }
    }
    //top 为 hash值的 高8位，后续用来定位一个桶中的 bmap 结构(一个桶后面是一个 bmap结构的 链表)
    top := tophash(hash)
bucketloop:
    //外层循环遍历一个桶中的所有 bmap 结构
    for ; b != nil; b = b.overflow(t) {
        //内层循环遍历每个 bmap结构中的 8 个 key/value
        for i := uintptr(0); i < bucketCnt; i++ {
            //每个 bmap 都有一个 [bucketCnt]uint8 的数组，分别存放每个 key hash值的高8位
            if b.tophash[i] != top {
                if b.tophash[i] == emptyRest {
                    break bucketloop
                }
                continue
            }
            //定位到一个 bmap 中的 key 的位置
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            //如果 key 是指针，取出所指向的值
            if t.indirectkey() {
                k = *((*unsafe.Pointer)(k))
            }
            //如果取出的 k 和要查找的 key 相同(只是根据高8位查找的，存在不相同的情况)
            if t.key.equal(key, k) {
                //取出 key 对应的 value
                e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
                if t.indirectelem() {
                    e = *((*unsafe.Pointer)(e))
                }
                return e
            }
        }
    }
    return unsafe.Pointer(&zeroVal[0])
}

map-find

map 的赋值

map 的赋值过程基本和上述的查找过程一致，首先用上述思想查找到当前的 key, 如果找到，则更新当前key 对应的 value, 如果找不到，则需要选择合适的插入位置，进行插入操作，源码如下：

/ / Like mapaccess, but allocates a slot for the key if it is not present in the map.
func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    if h == nil {
        panic(plainError("assignment to entry in nil map"))
    }
    if raceenabled {
        callerpc := getcallerpc()
        pc := funcPC(mapassign)
        racewritepc(unsafe.Pointer(h), callerpc, pc)
        raceReadObjectPC(t.key, key, callerpc, pc)
    }
    if msanenabled {
        msanread(key, t.key.size)
    }
    if h.flags&hashWriting != 0 {
        throw("concurrent map writes")
    }
    hash := t.hasher(key, uintptr(h.hash0))

    // Set hashWriting after calling t.hasher, since t.hasher may panic,
    // in which case we have not actually done a write.
    h.flags ^= hashWriting

    //如果当前 map 为空，则创建一个桶
    if h.buckets == nil {
        h.buckets = newobject(t.bucket) // newarray(t.bucket, 1)
    }

again:
    //根据key 的 hash值的后B位 定位到所在桶
    bucket := hash & bucketMask(h.B)
    //如果正在扩容，则执行搬迁操作，一次搬一个或者两个(map 的迁移是渐进式的，不是一次迁移全部的 key)
    //这种思想和 Redis 中的 map 迁移思想相同
    if h.growing() {
        growWork(t, h, bucket)
    }
    //定位到当前桶
    b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
    //获取到hash值的高8位
    top := tophash(hash)

    var inserti *uint8      //将要插入的位置
    var insertk unsafe.Pointer  //将要插入的 key
    var elem unsafe.Pointer  //将要插入的 value
bucketloop:
    //下面这两个 for 循环思想和上面查找的思想一致
    //外层对 bmap 链表进行遍历
    //内层对 bmap 中的 8 个 key 进行对比查找
    for {
        for i := uintptr(0); i < bucketCnt; i++ {
            if b.tophash[i] != top {
                //如果当前槽位没有放置 key, 则准备将该元素放置到该位置，首先进行预赋值，但并未真正插入
                if isEmpty(b.tophash[i]) && inserti == nil {
                    inserti = &b.tophash[i]
                    insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
                    elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
                }
                if b.tophash[i] == emptyRest {
                    break bucketloop
                }
                continue
            }
            //如果有匹配的，则进一步比较 key 是否相同
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            if t.indirectkey() {
                k = *((*unsafe.Pointer)(k))
            }
            //如果 key 不相同，则接着往下找
            if !t.key.equal(key, k) {
                continue
            }
            // already have a mapping for key. Update it.
            //如果 key 相等，则更新该 key 对应的 value
            if t.needkeyupdate() {
                typedmemmove(t.key, k, key)
            }
            elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
            goto done
        }
        //如果当前 bmap 没找到，继续往下一个 bmap 找，直到找完所有桶(包括溢出桶)
        ovf := b.overflow(t)
        if ovf == nil {
            break
        }
        b = ovf
    }

    // Did not find mapping for key. Allocate new cell & add entry.

    // If we hit the max load factor or we have too many overflow buckets,
    // and we're not already in the middle of growing, start growing.
    // 到此处说明未找到，需要插入一个新的 key/value
    // 如果负载因子超过阈值(6.5) 或者 溢出桶太多 noverflow >= (1 << (B & 15))
    if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
        // 对不同的扩容情况，执行具体的扩容策略(加倍扩容/等量扩容)
        hashGrow(t, h)
        //重新执行上述流程
        goto again // Growing the table invalidates everything, so try again
    }

    if inserti == nil {
        // all current buckets are full, allocate a new one.
        // 如果所有位置都是满的，则分配一个新的溢出桶，然后把 key/value 放到第一个位置
        newb := h.newoverflow(t, b)
        inserti = &newb.tophash[0]
        insertk = add(unsafe.Pointer(newb), dataOffset)
        elem = add(insertk, bucketCnt*uintptr(t.keysize))
    }

    // store new key/elem at insert position
    // 如果 key 是指针
    if t.indirectkey() {
        kmem := newobject(t.key)
        *(*unsafe.Pointer)(insertk) = kmem
        insertk = kmem
    }
    //如果 value 是指针
    if t.indirectelem() {
        vmem := newobject(t.elem)
        *(*unsafe.Pointer)(elem) = vmem
    }
    typedmemmove(t.key, insertk, key)
    // 把该 key hash值的高8位赋值，然后元素个数 +1
    *inserti = top
    h.count++

done:
    //再次检测在此执行期间是否有别的 goroutine 对这个 map 执行了写操作
    if h.flags&hashWriting == 0 {
        throw("concurrent map writes")
    }
    //把 flags 中对应的 hashWriting 标志清空
    h.flags &^= hashWriting
    if t.indirectelem() {
        elem = *((*unsafe.Pointer)(elem))
    }
    return elem
}

map-set

从上可以知道：map 在两种情况下会发生扩容，1：装载因子已经超过 6.5；2：哈希使用了太多的溢出桶(至于什么是 ”太多“，则需要看相应扩容源码部分)

map 的扩容

func hashGrow(t *maptype, h *hmap) {
    // If we've hit the load factor, get bigger.
    // Otherwise, there are too many overflow buckets,
    // so keep the same number of buckets and "grow" laterally.
    // 首先另 bigger + 1, 这种是加倍扩容的情景
    bigger := uint8(1)
    //如果不是负载因子的原因，则执行等量扩充
    if !overLoadFactor(h.count+1, h.B) {
        bigger = 0
        h.flags |= sameSizeGrow
    }
    //将当前的元素放入到 oldbuckets 中
    oldbuckets := h.buckets
    //新申请一个 bucket 数组，容量是 pow(2, B+(0/1))
    newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger, nil)

    flags := h.flags &^ (iterator | oldIterator)
    if h.flags&iterator != 0 {
        flags |= oldIterator
    }
    // commit the grow (atomic wrt gc)
    // 对 bmap 中的元素进行更新
    h.B += bigger
    h.flags = flags
    h.oldbuckets = oldbuckets
    h.buckets = newbuckets
    h.nevacuate = 0
    h.noverflow = 0

    if h.extra != nil && h.extra.overflow != nil {
        // Promote current overflow buckets to the old generation.
        if h.extra.oldoverflow != nil {
            throw("oldoverflow is not nil")
        }
        h.extra.oldoverflow = h.extra.overflow
        h.extra.overflow = nil
    }
    if nextOverflow != nil {
        if h.extra == nil {
            h.extra = new(mapextra)
        }
        h.extra.nextOverflow = nextOverflow
    }

    // the actual copying of the hash table data is done incrementally
    // by growWork() and evacuate().
}

map-grow

等量扩容: 当溢出桶的个数太多时，执行等量扩容操作，原因是对这个 map 执行了大量的添加删除操作，但是 key 的个数并没有增多，负载因子也并没有达到阈值(6.5), 想象这样一种场景：在 map 中插入删除了大量的 key, 导致正正常 bucket 中的元素都是空的，但是溢出桶中的元素都是满的，这时，我们并不需要分配额外的空间，只需要重新生成一个等量的 bucket 数组，然后对上述 oldbuckets 中的元素进行重新 hash, 然后放入 buckets 就可以了(从新执行以下 hash 映射就可以)。

参考资料

【 Go 夜读】 #44 Go map 源码阅读分析