Memory Management (2): Physical Page Allocation and the Slab Allocator (reading notes on 《奔跑吧Linux内核》)

Part 1: Physical page allocation

https://www.cnblogs.com/arnoldlu/p/8250734.html (reference)

Linux manages memory in units of pages, and physical pages are managed and allocated through the buddy system.

1.1: Allocating physical pages from the buddy system

The interface for allocating physical memory is alloc_pages. It allocates one or more physically contiguous pages, and the page count must be a power of two; its two parameters are an allocation mask (gfp_mask) and the allocation order.

include/linux/gfp.h defines the GFP (Get Free Page) allocation masks, which fall into two groups: masks prefixed with __GFP_, and masks prefixed with GFP_, which are generally combinations of the __GFP_ ones. The __GFP_ masks split into two classes, zone modifiers and action modifiers; the zone modifiers occupy the low 4 bits of the mask and select which zone pages are allocated from.
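
As a quick illustration, here is a minimal usage sketch (a hypothetical demo function, not from the book) that allocates four contiguous pages with a composite GFP_ mask and frees them again:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

static int demo_alloc_pages(void)
{
    /* order = 2 requests 2^2 = 4 contiguous physical pages */
    struct page *page = alloc_pages(GFP_KERNEL, 2);

    if (!page)
        return -ENOMEM;

    /* page_address() yields the kernel virtual address of the first page
     * (valid for lowmem zones such as ZONE_NORMAL) */
    memset(page_address(page), 0, 4 * PAGE_SIZE);

    __free_pages(page, 2);    /* free with the same order */
    return 0;
}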

alloc_pages eventually calls __alloc_pages_nodemask, the core function of the buddy system; this also shows that the buddy page allocator works on a per-zone basis.

struct alloc_context is the data structure that carries the allocation parameters through the buddy-system functions. gfp_zone() computes the zone index (high_zoneidx) from the allocation mask; for example, GFP_KERNEL maps to ZONE_NORMAL, while a mask containing __GFP_HIGHMEM maps to ZONE_HIGHMEM.

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    struct zoneref *preferred_zoneref;
    struct page *page = NULL;
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = {
        .high_zoneidx = gfp_zone(gfp_mask),----------------------------------gfp_zone() uses the low 4 bits of gfp_mask to find the matching zone_type, e.g. ZONE_NORMAL or ZONE_HIGHMEM
        .nodemask = nodemask,
        .migratetype = gfpflags_to_migratetype(gfp_mask),--------------------derive the page migratetype from gfp_mask, e.g. MIGRATE_RECLAIMABLE or MIGRATE_MOVABLE
    };
    gfp_mask &= gfp_allowed_mask;
    lockdep_trace_alloc(gfp_mask);
    might_sleep_if(gfp_mask & __GFP_WAIT);
    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;
    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;
    if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;

retry_cpuset:
    cpuset_mems_cookie = read_mems_allowed_begin();

    /* We set it here, as __alloc_pages_slowpath might have changed it */
    ac.zonelist = zonelist;
    /* The preferred zone is used for statistics later */
    preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                ac.nodemask ? : &cpuset_current_mems_allowed,
                &ac.preferred_zone);
    if (!ac.preferred_zone)
        goto out;
    ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

    /* First allocation attempt */
    alloc_mask = gfp_mask|__GFP_HARDWALL;
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);---------first attempt: decide which zone to allocate from by scanning the node's zonelist for a suitable zone, then return a page from it
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        alloc_mask = memalloc_noio_flags(gfp_mask);
        page = __alloc_pages_slowpath(alloc_mask, order, &ac);-----------------if the fast path fails, the slow path handles the many special cases here
    }
    if (kmemcheck_enabled && page)
        kmemcheck_pagealloc_alloc(page, order, gfp_mask);
    trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
        goto retry_cpuset;--------------------------------------------------retry the page allocation

    return page;
}
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                        const struct alloc_context *ac)
{
    struct zonelist *zonelist = ac->zonelist;
    struct zoneref *z;
    struct page *page = NULL;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;        /* set if using zonelist_cache */
    int did_zlc_setup = 0;        /* just call zlc_setup() one time */
    bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                (gfp_mask & __GFP_WRITE);
    int nr_fair_skipped = 0;
    bool zonelist_rescan;
zonelist_scan:-------------------------------------------------------------------start scanning ac->zonelist.
    zonelist_rescan = false;

    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     */
//Decide which zone to allocate from: scan the node's zonelist for a zone suitable for this allocation.
    for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,--------walk the zonelist starting from the given ac->high_zoneidx; each iteration yields a zone
                                ac->nodemask) {

...-----------------------------------------------------------------------------a series of eligibility checks; a zone that fails any of them is skipped and the loop moves to the next zone, a zone that passes them goes on to the watermark check
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];-----------------alloc_flags contains ALLOC_WMARK_LOW here
        if (!zone_watermark_ok(zone, order, mark,-------------------------------so the zone's low watermark is checked; if the check fails, either skip the zone or try zone_reclaim
                       ac->classzone_idx, alloc_flags)) {
            int ret;
            /* Checked here to keep the fast path fast */
            BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;
...
            ret = zone_reclaim(zone, gfp_mask, order);-------------------------reclaim some pages via zone_reclaim()
            switch (ret) {
...
                default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        ac->classzone_idx, alloc_flags))---------------------re-check whether the watermark is now satisfied
                    goto try_this_zone;
                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;
                continue;
            }
        }
try_this_zone:---------------------------------------------------------------all conditions, including the watermark, are satisfied; pages can be allocated from this zone.
        page = buffered_rmqueue(ac->preferred_zone, zone, order,-------------allocate the pages from this zone
                        gfp_mask, ac->migratetype);
        if (page) {
            if (prep_new_page(page, order, gfp_mask, alloc_flags))
                goto try_this_zone;
            return page;
        }
this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
            zlc_mark_zone_full(zonelist, z);
    }
    /*
     * The first pass makes sure allocations are spread fairly within the
     * local node.  However, the local node might have free pages left
     * after the fairness batches are exhausted, and remote zones haven't
     * even been considered yet.  Try once more without fairness, and
     * include remote zones now, before entering the slowpath and waking
     * kswapd: prefer spilling to a remote zone over swapping locally.
     */
    if (alloc_flags & ALLOC_FAIR) {
        alloc_flags &= ~ALLOC_FAIR;
        if (nr_fair_skipped) {
            zonelist_rescan = true;
            reset_alloc_batches(ac->preferred_zone);
        }
        if (nr_online_nodes > 1)
            zonelist_rescan = true;
    }
    if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        zonelist_rescan = true;
    }
    if (zonelist_rescan)
        goto zonelist_scan;
    return NULL;
}

The macro chain for_each_zone_zonelist_nodemask -> first_zones_zonelist -> next_zones_zonelist computes the zoneref and finally returns the zone structure, so the allocation mask determines which zones are candidates. On a system with high memory, zonerefs[0] is ZONE_HIGHMEM and zonerefs[1] is ZONE_NORMAL.

1.2: Watermark calculation and checks

Now look at __zone_watermark_ok, which decides whether a zone's free pages satisfy the watermark selected by alloc_flags. Parameters: z is the zone, order the allocation order, mark the watermark value, classzone_idx the zone index, alloc_flags the allocation flags, and free_pages the current number of free pages.

The kernel path that allocates physical memory checks against the WMARK_LOW watermark, while the kswapd reclaim thread works toward WMARK_HIGH; this gap makes pages in the different zones of a memory node age at different speeds.
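
For reference (a detail of this kernel generation, not shown in the excerpt below): __setup_per_zone_wmarks() derives the three watermarks from min_free_kbytes. Each zone receives a WMARK_MIN proportional to its size, and then WMARK_LOW = WMARK_MIN + (WMARK_MIN >> 2) and WMARK_HIGH = WMARK_MIN + (WMARK_MIN >> 1), i.e. roughly 1.25x and 1.5x the minimum watermark.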

static bool __zone_watermark_ok(struct zone *z, unsigned int order,
            unsigned long mark, int classzone_idx, int alloc_flags,
            long free_pages)
{
    /* free_pages may go negative - that's OK */
    long min = mark;
    int o;
    long free_cma = 0;

    free_pages -= (1 << order) - 1;---------------------------------------------subtract the pages about to be allocated, minus one; since the check below is a strict comparison, this requires at least min pages to remain free after all 2^order pages are taken
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
...
    if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])--------the free page count must exceed min plus the zone's lowmem_reserve value
        return false;
    for (o = 0; o < order; o++) {-----------------------------------------------walk every buddy order below the requested one, checking at each step whether the remaining free pages still satisfy the watermark
        /* At the next order, this order's pages become unavailable */
        free_pages -= z->free_area[o].nr_free << o;-----------------------------subtract this order's free pages from the total

        /* Require fewer higher order pages to be free */
        min >>= 1;--------------------------------------------------------------halve the watermark

        if (free_pages <= min)--------------------------------------------------compare against the reduced watermark again
            return false;
    }
    return true;----------------------------------------------------------------all checks passed: return true
}
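
A quick worked example with made-up numbers: request order = 2, mark = 128, lowmem_reserve = 0, free_cma = 0 and free_pages = 1000. The first check passes because 1000 - 3 = 997 > 128. Suppose free_area[0].nr_free = 200 and free_area[1].nr_free = 100: at o = 0, free_pages drops to 997 - 200 = 797 and min halves to 64 (797 > 64); at o = 1, free_pages drops by 100 << 1 to 597 and min halves to 32 (597 > 32). All checks pass, so the allocation may proceed: enough free blocks of order >= 2 remain.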

If the watermark check fails, pages can be reclaimed via zone_reclaim(); once a zone qualifies, buffered_rmqueue() takes the physical pages from the buddy system.

buffered_rmqueue splits on the value of order: order == 0 requests are served from the per-CPU lists in zone->pageset, while order > 0 requests come straight from the buddy free lists.
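
A simplified sketch of this split (illustrative only; the real buffered_rmqueue also refills the per-CPU lists through rmqueue_bulk(), updates statistics and handles hot/cold placement):

static struct page *rmqueue_sketch(struct zone *zone, unsigned int order,
                                   gfp_t gfp_flags, int migratetype)
{
    struct page *page = NULL;
    unsigned long flags;

    if (order == 0) {
        struct per_cpu_pages *pcp;

        /* order-0: take a page from the per-CPU page lists */
        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        page = list_first_entry_or_null(&pcp->lists[migratetype],
                                        struct page, lru);
        if (page) {
            list_del(&page->lru);
            pcp->count--;
        }
        local_irq_restore(flags);
    } else {
        /* order > 0: take 2^order pages from the buddy free lists */
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock_irqrestore(&zone->lock, flags);
    }
    return page;
}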

1.3: Freeing physical pages

__free_page
free_page-->free_pages
    __free_pages-------------------both paths converge here
        free_hot_cold_page---------order == 0: the page goes back to the per-CPU page lists
        __free_pages_ok------------order > 0: the pages go back to the buddy free lists
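
A small usage sketch (hypothetical demo function) pairing the two alloc/free interface families; each free must use the same order as the matching allocation:

#include <linux/gfp.h>

static void demo_free_paths(void)
{
    /* struct page interface: alloc_pages() pairs with __free_pages() */
    struct page *page = alloc_pages(GFP_KERNEL, 1);
    if (page)
        __free_pages(page, 1);

    /* address interface: __get_free_pages() pairs with free_pages() */
    unsigned long addr = __get_free_pages(GFP_KERNEL, 0);
    if (addr)
        free_pages(addr, 0);
}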

Part 2: The slab allocator

https://www.cnblogs.com/arnoldlu/p/8215414.html (reference)

The buddy system allocates in units of whole pages. If the kernel's frequent small allocations of contiguous memory also went through page allocation, memory would be wasted and fragmentation would increase, so the slab allocator is used for them instead.

The slab allocator still obtains its physical pages from the buddy system; it simply runs its own algorithm on top of those contiguous pages to manage small memory blocks.

Purpose: reduce the fragmentation caused by small allocations; cache frequently used objects to cut the cost of allocating, initializing and freeing them; and use colouring to place objects so the hardware cache is used more effectively.

The slab allocator maintains one cache per object type, which acts as a reserve of objects of that type. Each cache is divided into multiple slabs; each slab consists of one or more contiguous page frames and holds a number of objects.

(Here an "object" means a kernel data structure together with the operations that create and destroy it.)

Each cache is described by a kmem_cache structure, which records the cache's various properties. kmem_cache does not describe the individual slabs directly; the slabs are organized per node through the kmem_cache_node structure (called kmem_list3 in older kernels).

struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,---------creates the slab descriptor kmem_cache; no object memory is allocated yet (only the descriptor)
            unsigned long, void (*)(void *));
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);------------------allocate a slab object
void kmem_cache_free(struct kmem_cache *, void *);-------------------------free a slab object
void kmem_cache_destroy(struct kmem_cache *);-----------------------------destroy the slab descriptor
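
A minimal usage sketch of these four interfaces (the cache and object names are made up for illustration):

#include <linux/slab.h>

struct my_object {
    int id;
    char name[32];
};

static struct kmem_cache *my_cache;

static int demo_slab(void)
{
    struct my_object *obj;

    my_cache = kmem_cache_create("my_object_cache",
                                 sizeof(struct my_object),
                                 0,                  /* default alignment */
                                 SLAB_HWCACHE_ALIGN, /* align objects to cache lines */
                                 NULL);              /* no constructor */
    if (!my_cache)
        return -ENOMEM;

    obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
    if (obj)
        kmem_cache_free(my_cache, obj);

    kmem_cache_destroy(my_cache);
    return 0;
}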

2.1: Creating a slab descriptor

The slab descriptor, struct kmem_cache:

struct kmem_cache {
    struct array_cache __percpu *cpu_cache; //per-CPU local object cache

/* 1) Cache tunables. Protected by slab_mutex */
    unsigned int batchcount;-----------------------------------number of objects to fetch from the shared pool or the slabs_partial/slabs_free lists when the CPU's local array_cache is empty.
    unsigned int limit;----------------------------------------when the local object cache holds more than limit free objects, batchcount of them are released back, so the kernel can reclaim and destroy slabs.
    unsigned int shared;

    unsigned int size;-----------------------------------------object size after alignment
    struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

    unsigned int flags;        /* constant flags */------------allocation mask
    unsigned int num;        /* # of objs per slab */----------number of objects in each slab

/* 3) cache_grow/shrink */
    /* order of pgs per slab (2^n) */
    unsigned int gfporder;------------------------------------each slab of this cache occupies 2^gfporder pages

    /* force GFP flags, e.g. GFP_DMA */
    gfp_t allocflags;

    size_t colour;            /* cache colouring range */----number of distinct colours a slab can take
    unsigned int colour_off;    /* colour offset */----------size of one colour step, equal to the L1 cache line size

    struct kmem_cache *freelist_cache;
    unsigned int freelist_size;

    /* constructor func */
    void (*ctor)(void *obj);

/* 4) cache creation/removal */
    const char *name;----------------------------------------name of the slab descriptor
    struct list_head list;
    int refcount;--------------------------------------------reference count, consulted when destroying the slab descriptor
    int object_size;-----------------------------------------the object's actual size
    int align;-----------------------------------------------alignment size

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
    unsigned long num_active;
    unsigned long num_allocations;
    unsigned long high_mark;
    unsigned long grown;
    unsigned long reaped;
    unsigned long errors;
    unsigned long max_freeable;
    unsigned long node_allocs;
    unsigned long node_frees;
    unsigned long node_overflow;
    atomic_t allochit;
    atomic_t allocmiss;
    atomic_t freehit;
    atomic_t freemiss;

    /*
     * If debugging is enabled, then the allocator can add additional
     * fields and/or padding to every object. size contains the total
     * object size including these internal fields, the following two
     * variables contain the offset to the user object and its size.
     */
    int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
    struct memcg_cache_params memcg_params;
#endif

    struct kmem_cache_node *node[MAX_NUMNODES];-------per-node struct kmem_cache_node for this cache
};

The local object cache:

struct array_cache {
    unsigned int avail;-------------number of objects currently available in the cache
    unsigned int limit;
    unsigned int batchcount;
    unsigned int touched;----------set to 1 when an object is taken from the cache, cleared to 0 when the cache is shrunk.
    void *entry[];-----------------holds the pointers to the cached objects
};
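
entry[] behaves as a LIFO stack of object pointers. A rough sketch of the push/pop logic (the kernel's real helpers here are ac_get_obj() and ac_put_obj(), which add debugging and CMA handling):

/* pop the most recently freed object; NULL means the cache is empty */
static inline void *ac_pop(struct array_cache *ac)
{
    return ac->avail ? ac->entry[--ac->avail] : NULL;
}

/* push a freed object back; the caller checks ac->limit beforehand */
static inline void ac_push(struct array_cache *ac, void *objp)
{
    ac->entry[ac->avail++] = objp;
}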

The main entry point is kmem_cache_create (parameters: name, the slab descriptor's name; size, the object size; align, the alignment; flags, the allocation mask; ctor, the object constructor).

The core call flow: kmem_cache_create-----------------------------sanity checks, and look for an existing slab descriptor that can be reused
    do_kmem_cache_create----------------------fill the main parameters into the slab descriptor, then add the descriptor to the global slab_caches list.
        __kmem_cache_create-------------------the core of descriptor creation: word alignment (typically 4 bytes), computing the pages needed, the align size, the slab state, the object count, slab colouring, and so on.
            calculate_slab_order--------------compute the slab size: how many objects fit and how many pages one slab descriptor needs
            setup_cpu_cache-------------------continue configuring the slab descriptor (per-CPU caches)
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
          unsigned long flags, void (*ctor)(void *))
{
...
    s = __kmem_cache_alias(name, size, align, flags, ctor);----------------check whether an existing slab descriptor can be reused; if so, jump to out_unlock.
    if (s)
        goto out_unlock;
    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }
    s = do_kmem_cache_create(cache_name, size, size,----------------------call do_kmem_cache_create to create the slab descriptor
                 calculate_alignment(flags, align, size),
                 flags, ctor, NULL, NULL);
...
    return s;
}
Calculating the slab size:

static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    unsigned long offslab_limit;
    size_t left_over = 0;
    int gfporder;

    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {------start from gfporder = 0 and go up to KMALLOC_MAX_ORDER = 10, i.e. slab sizes from 4KB to 4MB.
        unsigned int num;
        size_t remainder;

        cache_estimate(gfporder, size, align, flags, &remainder, &num);//compute how many objects fit in 2^gfporder pages; the leftover space is used for cache colouring.
        if (!num)---------------------------------------------------------num != 0 means this gfporder is sufficient, holding at least one object of this size; num == 0 means try the next order.
            continue;

        /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
        if (num > SLAB_OBJ_MAX_NUM)--------------------------------------SLAB_OBJ_MAX_NUM is 255, so a slab never holds more than 255 objects
            break;

        if (flags & CFLGS_OFF_SLAB) {
            size_t freelist_size_per_obj = sizeof(freelist_idx_t);
            /*
             * Max number of objs-per-slab for caches which
             * use off-slab slabs. Needed to avoid a possible
             * looping condition in cache_grow().
             */
            if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
                freelist_size_per_obj += sizeof(char);
            offslab_limit = size;
            offslab_limit /= freelist_size_per_obj;

             if (num > offslab_limit)
                break;
        }

        /* Found something acceptable - save it away */
        cachep->num = num;
        cachep->gfporder = gfporder;
        left_over = remainder;-------------------------------------------record the object count and the number of pages needed
...
        /* Acceptable internal fragmentation? */
        if (left_over * 8 <= (PAGE_SIZE << gfporder))--------------------the colouring/fragmentation condition is satisfied; exit the for loop.
            break;
    }
    return left_over;
}
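
A rough worked example with assumed numbers: take PAGE_SIZE = 4096, an on-slab freelist whose freelist_idx_t entries are 1 byte, and an object size of 100 bytes. At gfporder = 0, cache_estimate() yields num = 4096 / (100 + 1) = 40 objects, leaving about 4096 - 40 * 101 = 56 bytes as left_over for colouring (ignoring alignment of the management area). Since left_over * 8 = 448 <= 4096, the loop exits at gfporder = 0 and each slab occupies a single page.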

2.3: Allocating a slab object

kmem_cache_alloc是slab分配缓存对象的核心函数,在slab分配缓存过程中是全程关闭本地中断的。

The path kmem_cache_alloc --> slab_alloc --> __do_cache_alloc runs with interrupts off.
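
A simplified sketch of how slab_alloc brackets the real work with local interrupts disabled (debugging and tracing hooks omitted):

static __always_inline void *slab_alloc_sketch(struct kmem_cache *cachep,
                                               gfp_t flags)
{
    unsigned long save_flags;
    void *objp;

    local_irq_save(save_flags);      /* disable local interrupts */
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);   /* re-enable them */
    return objp;
}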

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;
    bool force_refill = false;

    check_irq_off();

    ac = cpu_cache_get(cachep);----------------------------------------get the local (per-CPU) object cache
    if (likely(ac->avail)) {-------------------------------------------does the local object cache hold any free objects?
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);-------------------take one object from the local object cache
        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;-------------------------------------------------if objp was obtained, return the pointer directly.
        }
        force_refill = true;
    }
    STATS_INC_ALLOCMISS(cachep);
    objp = cache_alloc_refill(cachep, flags, force_refill);------------the core of slab allocation: refill the local cache
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);
out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

cache_alloc_refill is the core of slab allocation: it refills the local cache with batchcount objects, first from the shared array cache, then from the node's slabs_partial/slabs_free lists; if the node has no free objects at all, it calls cache_grow() to obtain a fresh slab from the buddy system.

2.4: Freeing a slab object

Slab objects are freed through kmem_cache_free; interrupts are likewise disabled for the whole free path.

kmem_cache_free->__cache_free (the core function)

static inline void __cache_free(struct kmem_cache *cachep, void *objp,
                unsigned long caller)
{
    struct array_cache *ac = cpu_cache_get(cachep);----------------find the local object cache

    check_irq_off();
    kmemleak_free_recursive(objp, cachep->flags);
    objp = cache_free_debugcheck(cachep, objp, caller);

    kmemcheck_slab_free(cachep, objp, cachep->object_size);
    if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
        return;
    if (ac->avail < ac->limit) {
        STATS_INC_FREEHIT(cachep);
    } else {
        STATS_INC_FREEMISS(cachep);
        cache_flusharray(cachep, ac);---------------------------------the local cache is full: flush objects back and try to reclaim free ones
    }
    ac_put_obj(cachep, ac, objp);-------------------------------------release the object into the local object cache ac
}

2.5: The kmalloc allocation function

kmalloc is built on the slab mechanism; the requested size is rounded up to a power-of-two number of bytes. The kmalloc caches are created in create_kmalloc_caches, reached via:

start_kernel-->mm_init-->kmem_cache_init-->create_kmalloc_caches.

KMALLOC_MIN_SIZE = 64, KMALLOC_SHIFT_LOW = 6, KMALLOC_SHIFT_HIGH = 13, KMALLOC_SHIFT_MAX = 23

For kmalloc sizes below 192 bytes, a special mapping table, static s8 size_index[24], decides which slab descriptor the request is served from.

The values stored in size_index are indices into kmalloc_caches, whose entries are created by create_kmalloc_caches.
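
A sketch of the lookup logic, closely following kmalloc_slab() in mm/slab_common.c (error handling trimmed):

static struct kmem_cache *kmalloc_slab_sketch(size_t size)
{
    unsigned int index;

    if (size <= 192) {
        if (!size)
            return ZERO_SIZE_PTR;
        /* small sizes: one size_index entry per 8 bytes */
        index = size_index[(size - 1) / 8];
    } else {
        /* larger sizes: round up to the next power of two */
        index = fls(size - 1);
    }
    return kmalloc_caches[index];
}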
