map结构初探

maps是erlang新出的一种数据结构，传说用来替代record。这里主要说下maps的具体实现，分析各种操作的时间复杂度（C层面），并对优缺点做一个总结。

环境：版本为R17

map结构

/* Fixed header words of a map term; the map's values are stored in the
 * words that directly follow this header. */
typedef struct map_s {
    Eterm thing_word; /* header word; set to MAP_HEADER when a map is built */
    Uint  size;       /* number of key/value pairs in the map */
    Eterm keys;      /* tuple */
} map_t;
/* map node
 *
 * -----------
 * Eterm   THING
 * Uint    size
 * Eterm   Keys -> {K1, K2, K3, ..., Kn} where n = size
 * ----
 * Eterm   V1
 * ...
 * Eterm   Vn, where n = size
 * -----------
 */

这个结构定义在erl_map.h中，这里的注释已经对map的结构说的很清楚了。
所以map的结构大概是这样。

在erl_map.c的注释中，可以看到此模块主要实现了下面的一系列maps模块的函数

maps:find/2
maps:from_list/1
maps:get/2
maps:is_key/2
maps:keys/1
maps:merge/2
maps:new/0
maps:put/3
maps:remove/2
maps:to_list/1
maps:update/3
maps:values/1

下面挑一些能够看清map结构的函数，做一个简单的分析。

maps:find/2

时间复杂度：O(N), 由下面代码可以看到在map中查找实际需要遍历整个keys，所以时间复杂度是O(N)

/*
 * maps:find/2
 *
 * BIF_ARG_1 is the key, BIF_ARG_2 the map.
 * Returns {ok, Value} when the key is present and the atom 'error' when it
 * is not. Fails with BADARG when BIF_ARG_2 is not a map.
 */
BIF_RETTYPE maps_find_2(BIF_ALIST_2) {
    Eterm found;
    Eterm *tuple_words;

    if (!is_map(BIF_ARG_2)) {
        BIF_ERROR(BIF_P, BADARG);
    }

    if (!erts_maps_find(BIF_ARG_1, BIF_ARG_2, &found)) {
        BIF_RET(am_error);
    }

    /* Build the 2-tuple {ok, Value}: one arity word plus two elements. */
    tuple_words    = HAlloc(BIF_P, 3);
    tuple_words[0] = make_arityval(2);
    tuple_words[1] = am_ok;
    tuple_words[2] = found;

    BIF_RET(make_tuple(tuple_words));
}

/*
 * Search `map` for `key`.
 *
 * On a hit the associated value is stored through `value` and 1 is
 * returned; otherwise 0 is returned and `*value` is left untouched.
 * The keys are scanned one by one from the front, so the cost grows
 * linearly with the number of entries.
 */
int erts_maps_find(Eterm key, Eterm map, Eterm *value) {
    map_t *m      = (map_t*)map_val(map);
    Uint   count  = map_get_size(m);
    Eterm *keys   = map_get_keys(m);
    Eterm *values = map_get_values(m);
    Uint   pos    = 0;

    /* Linear scan: stop at the first key that compares equal. */
    while (pos < count) {
        if (EQ(keys[pos], key)) {
            *value = values[pos];
            return 1;
        }
        pos++;
    }

    return 0;
}

maps:from_list/1

可以看到，from_list的操作是有点费时的，最坏情况下能到O(N^2)。

/*
 * maps:from_list/1
 *
 * Builds a map from BIF_ARG_1, which must be a proper list whose elements
 * are all 2-tuples {Key, Value}; anything else fails with BADARG.
 *
 * The key tuple, the map header and the values are carved out of a single
 * heap allocation. Pairs are inserted one at a time into their ordered
 * position (ordering by CMP_TERM), so the worst case is quadratic in the
 * length of the list. When a key occurs more than once, the later pair
 * overwrites the earlier one and the words that end up unused are given
 * back / cleared at the end.
 */
BIF_RETTYPE maps_from_list_1(BIF_ALIST_1) {
    Eterm *kv, item = BIF_ARG_1;
    Eterm *hp, *thp,*vs, *ks, keys, res;
    map_t *mp;
    Uint  size = 0, unused_size = 0;
    Sint  c = 0;
    Sint  idx = 0;

    if (is_list(item) || is_nil(item)) {

	/* Calculate size and check validity */
	/* One pass over the list: every element must be a 2-tuple; count them. */
	while(is_list(item)) {
	    res = CAR(list_val(item));
	    if (is_not_tuple(res))
		goto error;

	    kv = tuple_val(res);
	    if (*kv != make_arityval(2))
		goto error;

	    size++;
	    item = CDR(list_val(item));
	}

	if (is_not_nil(item))
	    goto error;
	/* Allocate the whole map in one go: 3 words for the map_t header,
	 * 1 word for the key tuple's arity, and 2 * size words for the keys
	 * and the values. Because the map is built from scratch, the key
	 * tuple is placed at a fixed position right in front of the header. */
	hp    = HAlloc(BIF_P, 3 + 1 + (2 * size));
	thp   = hp;
	keys  = make_tuple(hp);/* term referring to the key tuple at hp */
	*hp++ = make_arityval(size);/* arity word of the key tuple */
	ks    = hp;/* first key slot */
	hp   += size;
	mp    = (map_t*)hp;/* map header follows the keys */
	res   = make_map(mp);
	hp   += MAP_HEADER_SIZE;
	vs    = hp;/* first value slot, right after the header */

	/* fill in the map header */
	mp->thing_word = MAP_HEADER;
	mp->size = size; /* set later, might shrink*/
	mp->keys = keys;

	if (size == 0)
	    BIF_RET(res);

	item  = BIF_ARG_1;

	/* store the first pair directly; `size` now counts stored pairs */
	/* first entry */
	kv    = tuple_val(CAR(list_val(item)));
	ks[0] = kv[1];
	vs[0] = kv[2];
	size  = 1;
	item  = CDR(list_val(item));

	/* insert sort key/value pairs */
	while(is_list(item)) {

	    kv = tuple_val(CAR(list_val(item)));

	    /* compare ks backwards
	     * idx represent word index to be written (hole position).
	     * We cannot copy the elements when searching since we might
	     * have an equal key. So we search for just the index first =(
	     *
	     * It is perhaps faster to move the values in the first pass.
	     * Check for uniqueness during insert phase and then have a
	     * second phase compacting the map if duplicates are found
	     * during insert. .. or do some other sort .. shell-sort perhaps.
	     */

	    idx = size;
		/* effectively an insertion sort that keeps the keys ordered */
	    while(idx > 0 && (c = CMP_TERM(kv[1],ks[idx-1])) < 0) { idx--; }

	    if (c == 0) {
		/* last compare was equal,
		 * i.e. we have to release memory
		 * and overwrite that key/value
		 */
        /* the key is already stored: overwrite the pair in place */
		ks[idx-1] = kv[1];
		vs[idx-1] = kv[2];
		unused_size++;
	    } else {
        /* new key: shift the entries behind the hole one slot up to keep
         * the order, then write the pair into the hole */
		Uint i = size;
		while(i > idx) {
		    ks[i] = ks[i-1];
		    vs[i] = vs[i-1];
		    i--;
		}
		ks[idx] = kv[1];
		vs[idx] = kv[2];
		size++;
	    }
	    item = CDR(list_val(item));
	}

	if (unused_size) {
	    /* the key tuple is embedded in the heap
	     * write a bignum to clear it.
	     */
	    /* release values as normal since they are on the top of the heap */

	    ks[size] = make_pos_bignum_header(unused_size - 1);
	    HRelease(BIF_P, vs + size + unused_size, vs + size);
	}

	/* patch the key tuple arity and the map size to the final pair count */
	*thp = make_arityval(size);
	mp->size = size;
	BIF_RET(res);
    }

error:

    BIF_ERROR(BIF_P, BADARG);
}

maps:get/2

由于erts_maps_get的时间效率是O(N),所以get的时间效率也是O(N)

/*
 * maps:get/2
 *
 * BIF_ARG_1 is the key, BIF_ARG_2 the map.
 * Returns the value stored under the key. When the key is absent the call
 * fails with EXC_ERROR_2 and the reason term {bad_key, Key}; when
 * BIF_ARG_2 is not a map it fails with BADARG.
 */
BIF_RETTYPE maps_get_2(BIF_ALIST_2) {
    Eterm found;
    Eterm reason_atom;
    Eterm *reason_words;
    char *atom_name = "bad_key";

    if (!is_map(BIF_ARG_2)) {
        BIF_ERROR(BIF_P, BADARG);
    }

    if (erts_maps_get(BIF_ARG_1, BIF_ARG_2, &found)) {
        BIF_RET(found);
    }

    /* Key is missing: raise with the reason {bad_key, Key}. */
    reason_atom  = am_atom_put(atom_name, sys_strlen(atom_name));
    reason_words = HAlloc(BIF_P, 3);
    BIF_P->fvalue = TUPLE2(reason_words, reason_atom, BIF_ARG_1);
    BIF_ERROR(BIF_P, EXC_ERROR_2);
}

/*
 * Look up `key` in `map`.
 *
 * Returns 1 and stores the associated value through `value` when the key
 * is present; returns 0 (leaving `*value` untouched) when it is not or
 * when the map is empty. The keys are scanned linearly.
 *
 * Immediate keys are matched by plain word comparison and all other keys
 * with EQ. The two scans are mutually exclusive (the same if/else split
 * that erts_maps_put uses), so a missing immediate key costs one pass over
 * the keys instead of a word-compare pass followed by a second EQ pass.
 */
int erts_maps_get(Eterm key, Eterm map, Eterm *value) {
    Eterm *ks,*vs;
    map_t *mp;
    Uint n,i;

    mp  = (map_t*)map_val(map);
    n   = map_get_size(mp);

    if (n == 0)
	return 0;

    ks  = map_get_keys(mp);
    vs  = map_get_values(mp);

    if (is_immed(key)) {
	/* immediate key: a word comparison is sufficient */
	for( i = 0; i < n; i++) {
	    if (ks[i] == key) {
		*value = vs[i];
		return 1;
	    }
	}
    } else {
	/* non-immediate key: needs a full term comparison */
	for( i = 0; i < n; i++) {
	    if (EQ(ks[i], key)) {
		*value = vs[i];
		return 1;
	    }
	}
    }
    return 0;
}

maps:is_key/2

由代码可以看到还是需要遍历所有的key找到是否相等，时间效率O(N)

/*
 * maps:is_key/2
 *
 * BIF_ARG_1 is the key, BIF_ARG_2 the map.
 * Returns 'true' when the key is present in the map and 'false' otherwise;
 * fails with BADARG when BIF_ARG_2 is not a map. The keys are scanned
 * linearly.
 *
 * Immediate keys are matched by plain word comparison and all other keys
 * with EQ. The two scans are mutually exclusive (the same if/else split
 * that erts_maps_put uses), so a missing immediate key costs one pass over
 * the keys instead of a word-compare pass followed by a second EQ pass.
 */
BIF_RETTYPE maps_is_key_2(BIF_ALIST_2) {
    if (is_map(BIF_ARG_2)) {
	Eterm *ks, key;
	map_t *mp;
	Uint n,i;

	mp  = (map_t*)map_val(BIF_ARG_2);
	key = BIF_ARG_1;
	n   = map_get_size(mp);
	ks  = map_get_keys(mp);

	if (n == 0)
	    BIF_RET(am_false);

	if (is_immed(key)) {
	    /* immediate key: a word comparison is sufficient */
	    for( i = 0; i < n; i++) {
		if (ks[i] == key) {
		    BIF_RET(am_true);
		}
	    }
	} else {
	    /* non-immediate key: needs a full term comparison */
	    for( i = 0; i < n; i++) {
		if (EQ(ks[i], key)) {
		    BIF_RET(am_true);
		}
	    }
	}
	BIF_RET(am_false);
    }
    BIF_ERROR(BIF_P, BADARG);
}

maps:keys/1

虽然得到keys只要O(1),但是构造list需要O(N)

/*
 * maps:keys/1
 *
 * Returns a list holding the keys of the map in BIF_ARG_1, in the same
 * order as they appear in the map's key tuple. An empty map yields NIL.
 * Fails with BADARG when BIF_ARG_1 is not a map.
 */
BIF_RETTYPE maps_keys_1(BIF_ALIST_1) {
    map_t *m;
    Uint remaining;
    Eterm *cell, *keys;
    Eterm list = NIL;

    if (!is_map(BIF_ARG_1)) {
        BIF_ERROR(BIF_P, BADARG);
    }

    m         = (map_t*)map_val(BIF_ARG_1);
    remaining = map_get_size(m);

    if (remaining == 0) {
        BIF_RET(list);
    }

    /* Two words per cons cell. */
    cell = HAlloc(BIF_P, (2 * remaining));
    keys = map_get_keys(m);

    /* Cons from the last key towards the first so that the finished list
     * starts with the first key. */
    for (; remaining > 0; remaining--) {
        list  = CONS(cell, keys[remaining - 1], list);
        cell += 2;
    }

    BIF_RET(list);
}

maps:new/0

/*
 * maps:new/0
 *
 * Returns an empty map: a key tuple of arity 0 immediately followed by a
 * map header whose size is 0.
 */
BIF_RETTYPE maps_new_0(BIF_ALIST_0) {
    Eterm *words;
    map_t *header;

    /* One extra word on top of the header: the arity word of the (empty)
     * key tuple. */
    words    = HAlloc(BIF_P, (MAP_HEADER_SIZE + 1));
    words[0] = make_arityval(0);

    header             = (map_t*)(words + 1);
    header->thing_word = MAP_HEADER;
    header->size       = 0;
    header->keys       = make_tuple(words);

    BIF_RET(make_map(header));
}

maps:put/3

函数先假设已经存在这个key，找到key对应的value，然后修改它。如果key不存在于这个结构中，则按序复制相应的key、value对。

/*
 * maps:put/3
 *
 * BIF_ARG_1 is the key, BIF_ARG_2 the value, BIF_ARG_3 the map.
 * Delegates to erts_maps_put and returns its result; fails with BADARG
 * when BIF_ARG_3 is not a map.
 */
BIF_RETTYPE maps_put_3(BIF_ALIST_3) {
    if (!is_map(BIF_ARG_3)) {
        BIF_ERROR(BIF_P, BADARG);
    }

    BIF_RET(erts_maps_put(BIF_P, BIF_ARG_1, BIF_ARG_2, BIF_ARG_3));
}

/*
 * Return a map term equal to `map` with `key` associated with `value`.
 * All new words are allocated with HAlloc on process `p`; the words of the
 * input map are only read.
 *
 * Three cases:
 *  - empty input map: build a 1-element key tuple, a header and one value.
 *  - key already present (optimistic path): allocate header + n values,
 *    share the existing key tuple and copy the values with the matching
 *    one replaced.
 *  - key absent: the words allocated for the optimistic attempt are reused
 *    as the new (n+1)-element key tuple, a second allocation holds the new
 *    header and values, and the pair is inserted at its ordered position
 *    (ordering by CMP_TERM).
 */
Eterm erts_maps_put(Process *p, Eterm key, Eterm value, Eterm map) {
    Sint n,i;
    Sint c = 0;
    Eterm* hp, *shp;
    Eterm *ks,*vs, res, tup;
    map_t *mp = (map_t*)map_val(map);

    n = map_get_size(mp);

    if (n == 0) {
	/* layout: [arity 1][key][MAP_HEADER][size 1][keys][value] */
	hp    = HAlloc(p, MAP_HEADER_SIZE + 1 + 2);
	tup   = make_tuple(hp);
	*hp++ = make_arityval(1);
	*hp++ = key;
	res   = make_map(hp);
	*hp++ = MAP_HEADER;
	*hp++ = 1;
	*hp++ = tup;
	*hp++ = value;

	return res;
    }

    ks  = map_get_keys(mp);
    vs  = map_get_values(mp);

    /* only allocate for values,
     * assume key-tuple will be intact
     */

    hp  = HAlloc(p, MAP_HEADER_SIZE + n);
    shp = hp; /* save hp, used if optimistic update fails */
    res = make_map(hp);
    *hp++ = MAP_HEADER;
    *hp++ = n;
    *hp++ = mp->keys;

	/* Copy the values into the new map, substituting `value` at the
	 * position of a matching key; c records whether a match was seen.
	 * Immediate keys are matched by word comparison, others with EQ. */
    if (is_immed(key)) {
	for( i = 0; i < n; i ++) {
	    if (ks[i] == key) {
		*hp++ = value;
		vs++;
		c = 1;
	    } else {
		*hp++ = *vs++;
	    }
	}
    } else {
	for( i = 0; i < n; i ++) {
	    if (EQ(ks[i], key)) {
		*hp++ = value;
		vs++;
		c = 1;
	    } else {
		*hp++ = *vs++;
	    }
	}
    }
	/* the key was found: the optimistic copy is the result */
    if (c)
	return res;

    /* need to make a new tuple,
     * use old hp since it needs to be recreated anyway.
     */
    tup    = make_tuple(shp);
    *shp++ = make_arityval(n+1);

    /* second allocation: header (3 words) plus n + 1 values */
    hp    = HAlloc(p, 3 + n + 1);
    res   = make_map(hp);
    *hp++ = MAP_HEADER;
    *hp++ = n + 1;
    *hp++ = tup;

    /* restart from the beginning of the old keys and values */
    ks  = map_get_keys(mp);
    vs  = map_get_values(mp);

    ASSERT(n >= 0);

    /* copy map in order */
    /* entries whose key compares less than `key` go first */
    while (n && ((c = CMP_TERM(*ks, key)) < 0)) {
	*shp++ = *ks++;
	*hp++  = *vs++;
	n--;
    }

    /* the new pair goes into its ordered position */
    *shp++ = key;
    *hp++  = value;

    ASSERT(n >= 0);

    /* then the remaining entries */
    while(n--) {
	*shp++ = *ks++;
	*hp++  = *vs++;
    }
    /* we have one word remaining
     * this will work out fine once we get the size word
     * in the header.
     */
    /* the first allocation was MAP_HEADER_SIZE + n words but the key tuple
     * needs only n + 2; the spare word gets a bignum header of size 0 */
    *shp = make_pos_bignum_header(0);
    return res;
}

总结

总的来说，erlang的map结构实现得很简单，给我的感觉是很随意，各种操作都比较耗时。