From c8f5d1580b96301ff2e24cffffa5495c28414502 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:12 +0100 Subject: [PATCH 01/66] arm_mpam: Fix monitor instance selection when checking for hardware NRDY ANBZ: #36714 commit 1ef2a89584b7b788b2603590d886db076b2f24cc upstream. In _mpam_ris_hw_probe_hw_nrdy() a new register value to select the first monitor and relevant RIS is prepared in mon_sel. However, it is written to the monitor value register, e.g. MSMON_CSU, rather than MSMON_CFG_MON_SEL. As MSMON_CFG_MON_SEL is a 32 bit register update the type of mon_sel to u32. Write mon_sel to the intended register, MSMON_CFG_MON_SEL. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 41b14344b16f..817cb10a8e79 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -731,7 +731,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) { u32 now; - u64 mon_sel; + u32 mon_sel; bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; @@ -740,7 +740,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); - _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); now = _mpam_read_monsel_reg(msc, mon_reg); -- Gitee From 60a8ffac75bff033061a7040e585cd688dec60c6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:38 +0100 Subject: [PATCH 02/66] arm_mpam: Fix false positive assert failure during mpam_disable() 
ANBZ: #36714 commit f1caff3335ea6eab88cdc84ec8f2e3c45ca05486 upstream. mpam_assert_partid_sizes_fixed() is used to document that the caller doesn't expect the discovered PARTID size to change while it is walking a list sized by PARTID. Typically the MSC state is not written to until all the MSC have been discovered and this value is set. However, if discovering the MSC fails and schedules mpam_disable(), then the MSC state is written to reset it. In this case the discovered PARTID size may become smaller - but only PARTID 0 will be used once resctrl_exit() has been called. Skip the WARN_ON_ONCE() if mpam_disable_reason has been set. Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 817cb10a8e79..812c27b4dc04 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -164,11 +164,17 @@ static void mpam_free_garbage(void) /* * Once mpam is enabled, new requestors cannot further reduce the available * partid. Assert that the size is fixed, and new requestors will be turned - * away. + * away. This is needed when walking over structures sized by PARTID. + * + * During mpam_disable() these structures are not fixed, but the MSC state + * is still reset using whatever sizes have been discovered so far. As only + * PARTID 0 will be used after mpam_disable(), any race would be benign. + * Skip the check if a mpam_disable_reason has been set. 
*/ static void mpam_assert_partid_sizes_fixed(void) { - WARN_ON_ONCE(!partid_max_published); + if (!mpam_disable_reason) + WARN_ON_ONCE(!partid_max_published); } static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) -- Gitee From 7d0f1ec4abc5d168f7aaa9fbc3206ae6bad4914c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:39 +0100 Subject: [PATCH 03/66] arm_mpam: Check whether the config array is allocated before destroying it ANBZ: #36714 commit 6ccbb613b42a1f1ba7bfd547a148f644a902a25c upstream. __destroy_component_cfg() is called to free the configuration array. It uses the embedded 'garbage' structure, which means the array has to be allocated. If __destroy_component_cfg() is called from mpam_disable() before the configuration was ever allocated, then a NULL pointer is dereferenced. Check for this case and return early if the configuration is not allocated. __destroy_component_cfg() also frees the mbwu_state as this is allocated by __allocate_component_cfg(). As the mbwu_state is allocated after comp->cfg is set, and is also under mpam_list_lock, only the first pointer needs checking. 
Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 812c27b4dc04..c11b611b0128 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2591,6 +2591,9 @@ static void __destroy_component_cfg(struct mpam_component *comp) lockdep_assert_held(&mpam_list_lock); + if (!comp->cfg) + return; + add_to_garbage(comp->cfg); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; -- Gitee From 85357902bc61c1a7c48f78833bbb91a7130885fd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 7 Sep 2021 17:21:42 +0100 Subject: [PATCH 04/66] NVIDIA: SAUCE: untested: arm_mpam: resctrl: pick classes for use as mbm counters ANBZ: #36714 commit d12ec41cde40faf58bcd18b19d8bbf8b49478a9c NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 resctrl has two types of counters, NUMA-local and global. MPAM has only bandwidth counters, but the position of the MSC may mean it counts NUMA-local, or global traffic. But the topology information is not available. Apply a heuristic: if the L2 or L3 supports bandwidth monitors, these are probably NUMA-local. If the memory controller supports bandwidth monitors, they are probably global. This also allows us to assert that we don't have the same class backing two different resctrl events. Because the class or component backing the event may not be 'the L3', it is necessary for mpam_resctrl_get_domain_from_cpu() to search the monitor domains too. This matters the most for 'monitor only' systems, where 'the L3' control domains may be empty, and the ctrl_comp pointer NULL. 
Signed-off-by: James Morse (cherry picked from commit 40e0b0792745d65ea76f7b28f2642c590fe4dd9a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - mon_comp[] is defined in upstream. Remove its definition in this patch. - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 226ff6f532fa..a5463c59910b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -75,6 +75,14 @@ static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); */ static bool resctrl_enabled; +/* Whether this num_mbw_mon could result in a free_running system */ +static int __mpam_monitors_free_running(u16 num_mbwu_mon) +{ + if (num_mbwu_mon >= resctrl_arch_system_num_rmid_idx()) + return resctrl_arch_system_num_rmid_idx(); + return 0; +} + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -606,6 +614,24 @@ static bool cache_has_usable_csu(struct mpam_class *class) return true; } +static bool class_has_usable_mbwu(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return false; + + /* + * resctrl expects the bandwidth counters to be free running, + * which means we need as many monitors as resctrl has + * control/monitor groups. + */ + if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. 
@@ -949,6 +975,7 @@ static void counter_update_class(enum resctrl_event_id evt_id, static void mpam_resctrl_pick_counters(void) { struct mpam_class *class; + bool has_mbwu; lockdep_assert_cpus_held(); @@ -983,7 +1010,37 @@ static void mpam_resctrl_pick_counters(void) break; } } + + has_mbwu = class_has_usable_mbwu(class); + if (has_mbwu && topology_matches_l3(class)) { + pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); + + /* + * MBWU counters may be 'local' or 'total' depending on + * where they are in the topology. Counters on caches + * are assumed to be local. If it's on the memory + * controller, its assumed to be global. + * TODO: check mbm_local matches NUMA boundaries... + */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_MBM_LOCAL_EVENT_ID, + class); + break; + case MPAM_CLASS_MEMORY: + counter_update_class(QOS_L3_MBM_TOTAL_EVENT_ID, + class); + break; + default: + break; + } + } } + + /* Allocation of MBWU monitors assumes that the class is unique... */ + if (mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class) + WARN_ON_ONCE(mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class == + mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) -- Gitee From 28beb7649af2cda28b2cf28398283e76469e050b Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Jul 2025 15:39:36 +0100 Subject: [PATCH 05/66] NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate free running monitors ANBZ: #36714 commit 11de6cffe0ef720a2da674b18c1780db50f99633 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 When there are enough monitors, the resctrl mbm local and total files can be exposed. These need all the monitors that resctrl may use to be allocated up front. Add helpers to do this. If a different candidate class is discovered, the old array should be free'd and the allocated monitors returned to the driver. 
Signed-off-by: James Morse (cherry picked from commit 355bc5f578a4f17887f2574191c01fae5202abd7 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_internal.h | 8 ++- drivers/resctrl/mpam_resctrl.c | 92 +++++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1914aefdcba9..963f7bf74ce6 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -411,7 +411,13 @@ struct mpam_resctrl_res { struct mpam_resctrl_mon { struct mpam_class *class; - /* per-class data that resctrl needs will live here */ + /* + * Array of allocated MBWU monitors, indexed by (closid, rmid). + * When ABMC is not in use, this array directly maps (closid, rmid) + * to the allocated monitor. Otherwise this array is sparse, and + * un-assigned (closid, rmid) are -1. + */ + int *mbwu_idx_to_mon; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a5463c59910b..82801feb5211 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -626,10 +626,12 @@ static bool class_has_usable_mbwu(struct mpam_class *class) * which means we need as many monitors as resctrl has * control/monitor groups. 
*/ - if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) - return false; + if (__mpam_monitors_free_running(cprops->num_mbwu_mon)) { + pr_debug("monitors usable in free-running mode\n"); + return true; + } - return true; + return false; } /* @@ -951,10 +953,58 @@ static void mpam_resctrl_pick_mba(void) } } +static void __free_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + if (array[i] < 0) + continue; + + mpam_free_mbwu_mon(class, array[i]); + array[i] = ~0; + } +} + +static int __alloc_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + int mbwu_mon = mpam_alloc_mbwu_mon(class); + + if (mbwu_mon < 0) { + __free_mbwu_mon(class, array, num_mbwu_mon); + return mbwu_mon; + } + array[i] = mbwu_mon; + } + + return 0; +} + +static int *__alloc_mbwu_array(struct mpam_class *class, u16 num_mbwu_mon) +{ + int err; + size_t array_size = num_mbwu_mon * sizeof(int); + int *array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + + if (!array) + return ERR_PTR(-ENOMEM); + + memset(array, -1, array_size); + + err = __alloc_mbwu_mon(class, array, num_mbwu_mon); + if (err) + return ERR_PTR(err); + return_ptr(array); +} + static void counter_update_class(enum resctrl_event_id evt_id, struct mpam_class *class) { - struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evt_id]; + struct mpam_class *existing_class = mon->class; + u16 num_mbwu_mon = class->props.num_mbwu_mon; + int *existing_array = mon->mbwu_idx_to_mon; if (existing_class) { if (class->level == 3) { @@ -969,7 +1019,39 @@ static void counter_update_class(enum resctrl_event_id evt_id, } } - mpam_resctrl_counters[evt_id].class = class; + pr_debug("Updating event %u to use class %u\n", evt_id, class->level); + mon->class = class; + + if (evt_id == QOS_L3_OCCUP_EVENT_ID) + return; + + /* Might not need all the 
monitors */ + num_mbwu_mon = __mpam_monitors_free_running(num_mbwu_mon); + if (!num_mbwu_mon) { + pr_debug("Not pre-allocating free-running counters\n"); + return; + } + + /* + * This is the pre-allocated free-running monitors path. It always + * allocates one monitor per PARTID * PMG. + */ + WARN_ON_ONCE(num_mbwu_mon != resctrl_arch_system_num_rmid_idx()); + + mon->mbwu_idx_to_mon = __alloc_mbwu_array(class, num_mbwu_mon); + if (IS_ERR(mon->mbwu_idx_to_mon)) { + pr_debug("Failed to allocate MBWU array\n"); + mon->class = existing_class; + mon->mbwu_idx_to_mon = existing_array; + return; + } + + if (existing_array) { + pr_debug("Releasing previous class %u's monitors\n", + existing_class->level); + __free_mbwu_mon(existing_class, existing_array, num_mbwu_mon); + kfree(existing_array); + } } static void mpam_resctrl_pick_counters(void) -- Gitee From 1e270b69cae421b43b1c31e22d2ad7884c447444 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 12:33:20 +0100 Subject: [PATCH 06/66] NVIDIA: SAUCE: arm_mpam: resctrl: Pre-allocate assignable monitors ANBZ: #36714 commit aa7555911398c9a25b0a29432b01d65db50e682d NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 When there are not enough monitors, MPAM is able to emulate ABMC by making a smaller number of monitors assignable. These monitors still need to be allocated from the driver, and mapped to whichever control/monitor group resctrl wants to use them with. Add a second array to hold the monitor values indexed by resctrl's cntr_id. When CDP is in use, two monitors are needed so the available number of counters halves. Platforms with one monitor will have zero monitors when CDP is in use. 
Signed-off-by: James Morse (forward ported from commit d8a0ad3da1831147810bb58fc2459a6e36e26873 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; - Report returned value from mpam_resctrl_monitor_init_abmc(); ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_internal.h | 7 +++ drivers/resctrl/mpam_resctrl.c | 108 +++++++++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 963f7bf74ce6..bee58b8347d3 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -418,6 +418,13 @@ struct mpam_resctrl_mon { * un-assigned (closid, rmid) are -1. */ int *mbwu_idx_to_mon; + + /* + * Array of assigned MBWU monitors, indexed by idx argument. + * When ABMC is not in use, this array can be NULL. Otherwise + * it maps idx to the allocated monitor. + */ + int *assigned_counters; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 82801feb5211..e8eefa69658a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -75,6 +75,12 @@ static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); */ static bool resctrl_enabled; +/* + * L3 local/total may come from different classes - what is the number of MBWU + * 'on L3'? + */ +static unsigned int l3_num_allocated_mbwu = ~0; + /* Whether this num_mbw_mon could result in a free_running system */ static int __mpam_monitors_free_running(u16 num_mbwu_mon) { @@ -83,6 +89,15 @@ static int __mpam_monitors_free_running(u16 num_mbwu_mon) return 0; } +/* + * If l3_num_allocated_mbwu is forced below PARTID * PMG, then the counters + * are not free running, and ABMC's user-interface must be used to assign them. 
+ */ +static bool mpam_resctrl_abmc_enabled(void) +{ + return l3_num_allocated_mbwu < resctrl_arch_system_num_rmid_idx(); +} + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -146,16 +161,6 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, return -EOPNOTSUPP; } -bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) -{ - return false; -} - -int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) -{ - return -EINVAL; -} - int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) { return -EOPNOTSUPP; @@ -193,6 +198,21 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } +static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +{ + l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + if (cdp_enabled) + l3->mon.num_mbm_cntrs /= 2; + + if (l3->mon.num_mbm_cntrs) { + l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + } else { + l3->mon.mbm_cntr_assignable = false; + l3->mon.mbm_assign_on_mkdir = false; + } +} + int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) { u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; @@ -252,6 +272,7 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); resctrl_reset_task_closids(); + mpam_resctrl_monitor_sync_abmc_vals(l3); for_each_possible_cpu(cpu) mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); @@ -631,6 +652,11 @@ static bool class_has_usable_mbwu(struct mpam_class *class) return true; } + if (cprops->num_mbwu_mon) { + pr_debug("monitors usable via ABMC assignment\n"); + return true; + } + return false; } @@ -978,6 +1004,8 @@ static int __alloc_mbwu_mon(struct mpam_class *class, int *array, array[i] = mbwu_mon; } + l3_num_allocated_mbwu = min(l3_num_allocated_mbwu, num_mbwu_mon); + return 0; 
} @@ -1125,6 +1153,23 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + return false; + + return mpam_resctrl_abmc_enabled(); +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + lockdep_assert_cpus_held(); + + WARN_ON_ONCE(1); + + return 0; +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; @@ -1202,6 +1247,41 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +/* + * This must run after all event counters have been picked so that any free + * running counters have already been allocated. + */ +static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); + int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + struct rdt_resource *l3 = &res->resctrl_res; + struct mpam_class *class = mon->class; + u16 num_mbwu_mon; + + if (mon->mbwu_idx_to_mon) { + pr_debug("monitors free running\n"); + return 0; + } + + if (!rmid_array) { + pr_debug("Failed to allocate RMID array\n"); + return -ENOMEM; + } + memset(rmid_array, -1, array_size); + + num_mbwu_mon = class->props.num_mbwu_mon; + mon->assigned_counters = __alloc_mbwu_array(mon->class, num_mbwu_mon); + if (IS_ERR(mon->assigned_counters)) + return PTR_ERR(mon->assigned_counters); + mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); + + mpam_resctrl_monitor_sync_abmc_vals(l3); + + return 0; +} + static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { @@ -1249,6 +1329,14 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, if (resctrl_enable_mon_event(type, false, 0, 
NULL)) l3->mon_capable = true; + switch (type) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return mpam_resctrl_monitor_init_abmc(mon); + default: + return 0; + } + return 0; } -- Gitee From e746df907e05d2eefa5e08f26a6e1424c797cdbd Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Oct 2025 14:31:11 +0100 Subject: [PATCH 07/66] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add kunit test for ABMC/CDP interactions ANBZ: #36714 commit 8453b8fc97f60289fc7e1d836bdb5e675c558096 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 ABMC exposes a fun corner case where a platform with one monitor can use ABMC for assignable counters - but not when CDP is enabled. Add some tests. Signed-off-by: James Morse (cherry picked from commit a861a0f40d75549387301244a228b519c86c063b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/test_mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/test_mpam_resctrl.c | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index b93d6ad87e43..4145f057bd31 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -296,6 +296,67 @@ static void test_percent_to_max_rounding(struct kunit *test) KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); } +static void test_num_assignable_counters(struct kunit *test) +{ + unsigned int orig_l3_num_allocated_mbwu = l3_num_allocated_mbwu; + u32 orig_mpam_partid_max = mpam_partid_max; + u32 orig_mpam_pmg_max = mpam_pmg_max; + bool orig_cdp_enabled = cdp_enabled; + struct rdt_resource fake_l3; + + /* Force there to be some PARTID/PMG */ + mpam_partid_max = 3; + mpam_pmg_max = 1; + + cdp_enabled = false; + + /* ABMC off, CDP off */ + l3_num_allocated_mbwu = 
resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx()); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP off */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 4); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + cdp_enabled = true; + + /* ABMC off, CDP on */ + l3_num_allocated_mbwu = resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + + /* (value not consumed by resctrl) */ + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx() / 2); + + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP on */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 2); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC 'on', CDP on - but not enough counters */ + l3_num_allocated_mbwu = 1; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 0); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* Restore global variables that were messed with */ + l3_num_allocated_mbwu = orig_l3_num_allocated_mbwu; + mpam_partid_max = orig_mpam_partid_max; + mpam_pmg_max = orig_mpam_pmg_max; + cdp_enabled = orig_cdp_enabled; +} + static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), @@ -304,6 +365,7 @@ 
static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_percent_to_max_rounding), KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_num_assignable_counters), {} }; -- Gitee From a66c270253731e0024499f7a7ca17a4023d469d5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 14:33:11 +0100 Subject: [PATCH 08/66] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_config_cntr() for ABMC use ANBZ: #36714 commit 1c0f936ee0092e9fdda5265d8940fe92158fe5b5 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 ABMC has a helper resctrl_arch_config_cntr() for changing the mapping between 'cntr_id' and a CLOSID/RMID pair. Add the helper. For MPAM this is done by updating the mon->mbwu_idx_to_mon[] array, and as usual CDP means it needs doing in three different ways. Signed-off-by: James Morse (cherry picked from commit ce6ad9dcc0fd43bb2a7558fdae6c11e96cf2f066 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Remove empty definition of resctrl_arch_config_cntr() - Resolve struct rdt_l3_mon_domain parameter in resctrl_arch_config_cntr() ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 43 +++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e8eefa69658a..753e11bd9d0b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -148,12 +148,6 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d { } -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - enum resctrl_event_id evtid, u32 rmid, u32 closid, - u32 cntr_id, bool assign) -{ -} - int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum 
resctrl_event_id eventid, u64 *val) @@ -1153,6 +1147,43 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, + bool assign) +{ + u32 mbwu_idx, mon_idx = resctrl_get_config_index(cntr_id, cdp_type); + + closid = resctrl_get_config_index(closid, cdp_type); + mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + WARN_ON_ONCE(mon_idx > l3_num_allocated_mbwu); + + if (assign) + mon->mbwu_idx_to_mon[mbwu_idx] = mon->assigned_counters[mon_idx]; + else + mon->mbwu_idx_to_mon[mbwu_idx] = -1; +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->mbwu_idx_to_mon || !mon->assigned_counters) { + pr_debug("monitor arrays not allocated\n"); + return; + } + + if (cdp_enabled) { + __config_cntr(mon, cntr_id, CDP_CODE, closid, rmid, assign); + __config_cntr(mon, cntr_id, CDP_DATA, closid, rmid, assign); + } else { + __config_cntr(mon, cntr_id, CDP_NONE, closid, rmid, assign); + } + + resctrl_arch_reset_rmid(r, d, closid, rmid, evtid); +} + bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) -- Gitee From 6e96fa765c79671375b0bd5bc4f80489738c6ef4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 16:36:58 +0100 Subject: [PATCH 09/66] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_rmid_read() and resctrl_arch_reset_rmid() ANBZ: #36714 commit 21a0edac8cc598c044161db89162f95a5dbee8e8 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The same goes for reset. 
The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. Signed-off-by: James Morse (cherry picked from commit d45ffcb70f8a2c055b1b449b0a0780773cc5ca55 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - resctrl_arch_round_mon_val() has been defined in upstream. No need to re-define it here; - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 182 ++++++++++++++++++++++----------- 1 file changed, 123 insertions(+), 59 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 753e11bd9d0b..82c0b58bf4f2 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -137,11 +137,6 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai { } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id eventid) -{ -} - void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid) @@ -464,12 +459,49 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } -static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, - int mon_idx, - enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. 
+ */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + +static int +__read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) { - struct mon_cfg cfg; + struct mon_cfg cfg = { }; if (!mpam_is_enabled()) return -EINVAL; @@ -477,18 +509,29 @@ static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_c /* Shift closid to account for CDP */ closid = resctrl_get_config_index(closid, cdp_type); + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; + if (mon_idx == -1) { + if (mpam_resctrl_abmc_enabled()) { + /* Report Unassigned */ + return -ENOENT; + } + /* Report Unavailable */ + return -EINVAL; + } + } + + cfg.mon = mon_idx; + cfg.match_pmg = true; + cfg.partid = closid; + cfg.pmg = rmid; + if (irqs_disabled()) { /* Check if we can access this domain without an IPI */ return -EIO; } - cfg = (struct mon_cfg) { - .mon = mon_idx, - .match_pmg = true, - .partid = closid, - .pmg = rmid, - }; - return mpam_msmon_read(mon_comp, &cfg, mon_type, 
val); } @@ -497,29 +540,27 @@ static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component int mon_idx, u32 closid, u32 rmid, u64 *val) { if (cdp_enabled) { - u64 code_val = 0, data_val = 0; + u64 cdp_val = 0; int err; err = __read_mon(mon, mon_comp, mon_type, mon_idx, - CDP_CODE, closid, rmid, &code_val); + CDP_CODE, closid, rmid, &cdp_val); if (err) return err; err = __read_mon(mon, mon_comp, mon_type, mon_idx, - CDP_DATA, closid, rmid, &data_val); - if (err) - return err; - - *val += code_val + data_val; - return 0; + CDP_DATA, closid, rmid, &cdp_val); + if (!err) + *val += cdp_val; + return err; } return __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_NONE, closid, rmid, val); } -/* MBWU when not in ABMC mode (not supported), and CSU counters. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, +/* MBWU when not in ABMC mode, and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, void *arch_priv, u64 *val, void *arch_mon_ctx) { @@ -531,58 +572,81 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, resctrl_arch_rmid_read_context_check(); - if (!mpam_is_enabled()) - return -EINVAL; - if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); mon_comp = l3_dom->mon_comp[eventid]; - if (eventid != QOS_L3_OCCUP_EVENT_ID) + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + mon_type = mpam_feat_msmon_csu; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mon_type = mpam_feat_msmon_mbwu; + break; + default: return -EINVAL; - - mon_type = mpam_feat_msmon_csu; + } return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, closid, rmid, val); } -/* - * The rmid realloc threshold should be for the smallest cache exposed to - * resctrl. 
- */ -static int update_rmid_limits(struct mpam_class *class) +static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) { - u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); - struct mpam_props *cprops = &class->props; - struct cacheinfo *ci; + struct mon_cfg cfg = { }; - lockdep_assert_cpus_held(); + if (!mpam_is_enabled()) + return; - if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) - return 0; + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); - /* - * Assume cache levels are the same size for all CPUs... - * The check just requires any online CPU and it can't go offline as we - * hold the cpu lock. - */ - ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); - if (!ci || ci->size == 0) { - pr_debug("Could not read cache size for class %u\n", - class->level); - return -EINVAL; + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; } - if (!resctrl_rmid_realloc_limit || - ci->size < resctrl_rmid_realloc_limit) { - resctrl_rmid_realloc_limit = ci->size; - resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + if (mon_idx == -1) + return; + cfg.mon = mon_idx; + mpam_msmon_reset_mbwu(mon_comp, &cfg); +} + +static void reset_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, u32 closid, u32 rmid) +{ + if (cdp_enabled) { + __reset_mon(mon, mon_comp, mon_idx, CDP_CODE, closid, rmid); + __reset_mon(mon, mon_comp, mon_idx, CDP_DATA, closid, rmid); + } else { + __reset_mon(mon, mon_comp, mon_idx, CDP_NONE, closid, rmid); } +} - return 0; +/* Called via IPI. Call with read_cpus_lock() held. 
*/ +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + if (!mpam_is_enabled()) + return; + + /* Only MBWU counters are relevant, and for supported event types. */ + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } static bool cache_has_usable_cpor(struct mpam_class *class) -- Gitee From 31f92c68fa20e6ce7ce8becd81fa21c8b445d1b7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 26 Aug 2025 16:05:07 +0100 Subject: [PATCH 10/66] NVIDIA: SAUCE: arm_mpam: resctrl: Add resctrl_arch_cntr_read() & resctrl_arch_reset_cntr() ANBZ: #36714 commit 07e445fb2c4a724e32073b81eb03579fd1f6c5c3 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 When used in ABMC mode, resctrl uses a different set of helpers to read and reset the counters. Add these. 
Signed-off-by: James Morse (cherry picked from commit 81af700d29ca8d39ed835ad1cee1ab8095517a9d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Remove empty definitions of resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() - Resolve struct rdt_l3_mon_domain parameter in resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 56 ++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 82c0b58bf4f2..aa260a1e1186 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -137,19 +137,6 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai { } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 closid, u32 rmid, int cntr_id, - enum resctrl_event_id eventid) -{ -} - -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 unused, u32 rmid, int cntr_id, - enum resctrl_event_id eventid, u64 *val) -{ - return -EOPNOTSUPP; -} - int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) { return -EOPNOTSUPP; @@ -594,6 +581,28 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, closid, rmid, val); } +/* MBWU counters when in ABMC mode */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int mon_idx, + enum resctrl_event_id eventid, u64 *val) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return -EINVAL; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return -EINVAL; + + l3_dom = container_of(d, struct mpam_resctrl_dom, 
resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + return read_mon_cdp_safe(mon, mon_comp, mpam_feat_msmon_mbwu, mon_idx, + closid, rmid, val); +} + static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) @@ -649,6 +658,27 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } +/* Reset an assigned counter */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; -- Gitee From 06632ad3531a89cd5150ea8b604027d4d13de0eb Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:20:48 +0100 Subject: [PATCH 11/66] NVIDIA: VR: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and closid_num_dirty_rmid[] ANBZ: #36714 commit 9fc15b5f3212093fa57276bcf39ff3a16aa6e55b NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 On MPAM systems if an error occurs the architecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdtgroup_mutex and kfree()s closid_num_dirty_rmid[]. It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. 
Pull the IS_ENABLED() Kconfig checks into a helper and additionally check that the array has been allocated. This will cause callers to fallback to the regular CLOSID allocation strategy. Signed-off-by: James Morse (cherry picked from commit b9be9ec43910a549fb4f5eaced3bffcebc6a180e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/monitor.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9fd901c78dc6..65a761fb4c1e 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -113,6 +113,20 @@ static inline struct rmid_entry *__rmid_entry(u32 idx) return entry; } +static bool __has_closid_num_dirty_rmid_array(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return false; + + /* + * Avoid a race with dom_data_exit() freeing the array under + * rdtgroup_mutex. 
+ */ + return closid_num_dirty_rmid; +} + static void limbo_release_entry(struct rmid_entry *entry) { lockdep_assert_held(&rdtgroup_mutex); @@ -120,7 +134,7 @@ static void limbo_release_entry(struct rmid_entry *entry) rmid_limbo_count--; list_add_tail(&entry->list, &rmid_free_lru); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]--; } @@ -244,7 +258,7 @@ int resctrl_find_cleanest_closid(void) lockdep_assert_held(&rdtgroup_mutex); - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (!__has_closid_num_dirty_rmid_array()) return -EIO; for (i = 0; i < closids_supported(); i++) { @@ -317,7 +331,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) } rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]++; } -- Gitee From ef7cc75eea068c88a40b763897a953a4660a891a Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:41:58 +0100 Subject: [PATCH 12/66] NVIDIA: VR: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and rmid_ptrs[] ANBZ: #36714 commit d7c2bfdd4257e3ffd06102b0b7d38070883caf60 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 On MPAM systems if an error occurs the architecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdtgroup_mutex and kfree()s rmid_ptrs[]. It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. Make __rmid_entry() return NULL in this case. Neither __check_limbo() nor free_rmid() return an error, and can silently stop their work if this occurs. dom_data_init() has only just allocated the array and still holds the lock, so __rmid_entry() should never return NULL here. 
Signed-off-by: James Morse (cherry picked from commit c1ac3a4e7a0d09175fb84eb7be2b7b23e8c09f09 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/monitor.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/monitor.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 65a761fb4c1e..e338b8d48405 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -98,12 +98,17 @@ unsigned int resctrl_rmid_realloc_limit; * * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code * must accept an attempt to read every index. + * + * Returns NULL if the rmid_ptrs[] array is not allocated. */ static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; u32 closid, rmid; + if (!rmid_ptrs) + return NULL; + entry = &rmid_ptrs[idx]; resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); @@ -175,6 +180,8 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) break; entry = __rmid_entry(idx); + if (!entry) + break; if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { @@ -353,6 +360,8 @@ void free_rmid(u32 closid, u32 rmid) return; entry = __rmid_entry(idx); + if (!entry) + return; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); @@ -959,6 +968,7 @@ int setup_rmid_lru_list(void) idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); entry = __rmid_entry(idx); + WARN_ON_ONCE(!entry); list_del(&entry->list); return 0; -- Gitee From 239fa2ffd59daa40a13b29100b4bdf0884e089ae Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 22 Dec 2022 17:01:52 +0000 Subject: [PATCH 13/66] NVIDIA: VR: SAUCE: debugfs: Add helpers for creating cpumask entries in debugfs ANBZ: #36714 commit 
be63b37a935db8ac29ec0d77813ca33d792e3c54 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 debugfs has handy helpers to make a bool, integer or string available through debugfs. Add helpers to do the same for cpumasks. These are read only. CC: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 25c2e6fafcfd8044ea148672d3e6b4b29be0d756 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit be63b37a935db8ac29ec0d77813ca33d792e3c54) Signed-off-by: Jay Chen --- fs/debugfs/file.c | 64 +++++++++++++++++++++++++++++++++++++++++ include/linux/debugfs.h | 6 ++++ 2 files changed, 70 insertions(+) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index edd6aafbfbaa..6e8c15016ece 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1143,6 +1143,70 @@ void debugfs_create_str(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_str); +static ssize_t debugfs_read_file_cpumask(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + struct cpumask *cpumask; + char *kernel_buf; + ssize_t ret; + int len; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + /* How long is a piece of string? 
*/ + kernel_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kernel_buf) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + cpumask = (struct cpumask *)file->private_data; + len = scnprintf(kernel_buf, PAGE_SIZE, + "%*pb\n", cpumask_pr_args(cpumask)); + debugfs_file_put(dentry); + if (len + 1 >= PAGE_SIZE) { + kfree(kernel_buf); + return -EIO; + } + + ret = simple_read_from_buffer(user_buf, count, ppos, kernel_buf, len); + kfree(kernel_buf); + + return ret; +} + +static const struct file_operations fops_cpumask_ro = { + .read = debugfs_read_file_cpumask, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_cpumask - create a read-only debugfs file that is used to read a cpumask + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. 
+ */ +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ + /* Only read-only is supported */ + WARN_ON_ONCE(mode & S_IWUGO); + + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_cpumask_ro, + &fops_cpumask_ro, &fops_cpumask_ro); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 4177c4738282..591d4b7267d8 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -202,6 +202,8 @@ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value); void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value); +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value); struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, @@ -407,6 +409,10 @@ static inline void debugfs_create_str(const char *name, umode_t mode, char **value) { } +static inline void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ } + static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) -- Gitee From bdacc45eb976d9d265765058f6ea4b37dc40f782 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 1 Sep 2021 15:13:12 +0100 Subject: [PATCH 14/66] NVIDIA: VR: SAUCE: arm_mpam: Add debugfs entries to show the MSC/RIS the driver discovered ANBZ: #36714 commit 919bf27649a550c396b324b9e1f791718c1485ad NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Not all of MPAM is visible through the resctrl user-space interface. To make it easy to debug why certain devices were not exposed through resctrl, allow the properties of the devices to be read through debugfs. 
This adds an mpam directory to debugfs, and exposes the devices as well as the hierarchy that was built. Signed-off-by: James Morse (cherry picked from commit e8f0f2147103bec25b367a273abfb7b6805df914 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 136 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 9 +++ 2 files changed, 136 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index c11b611b0128..9728150e6e22 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -83,6 +83,8 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +static struct dentry *mpam_debugfs; + /* * Whether resctrl has been setup. Used by cpuhp in preference to * mpam_is_enabled(). 
The disable call after an error interrupt makes @@ -339,6 +341,8 @@ static void mpam_class_destroy(struct mpam_class *class) { lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(class->debugfs); + class->debugfs = NULL; list_del_rcu(&class->classes_list); add_to_garbage(class); } @@ -391,6 +395,8 @@ static void mpam_component_destroy(struct mpam_component *comp) __destroy_component_cfg(comp); + debugfs_remove_recursive(comp->debugfs); + comp->debugfs = NULL; list_del_rcu(&comp->class_list); add_to_garbage(comp); @@ -441,6 +447,8 @@ static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(vmsc->debugfs); + vmsc->debugfs = NULL; list_del_rcu(&vmsc->comp_list); add_to_garbage(vmsc); @@ -600,6 +608,8 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); clear_bit(ris->ris_idx, &msc->ris_idxs); + debugfs_remove_recursive(ris->debugfs); + ris->debugfs = NULL; list_del_rcu(&ris->msc_list); list_del_rcu(&ris->vmsc_list); add_to_garbage(ris); @@ -776,32 +786,32 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) /* Cache Capacity Partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { - u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + ris->ccap_idr = mpam_read_partsel_reg(msc, CCAP_IDR); - props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ris->ccap_idr); if (props->cmax_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_softlim, props); if (props->cmax_wd && - !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmax, props); if (props->cmax_wd && - 
FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmin, props); - props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ris->ccap_idr); if (props->cassoc_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cassoc, props); } /* Cache Portion partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { - u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + ris->cpor_idr = mpam_read_partsel_reg(msc, CPOR_IDR); - props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, ris->cpor_idr); if (props->cpbm_wd) mpam_set_feature(mpam_feat_cpor_part, props); } @@ -2005,6 +2015,9 @@ static void mpam_msc_destroy(struct mpam_msc *msc) list_del_rcu(&msc->all_msc_list); platform_set_drvdata(pdev, NULL); + debugfs_remove_recursive(msc->debugfs); + msc->debugfs = NULL; + add_to_garbage(msc); } @@ -2023,6 +2036,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) { int err; u32 tmp; + char name[20]; struct mpam_msc *msc; struct resource *msc_res; struct device *dev = &pdev->dev; @@ -2090,6 +2104,10 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) list_add_rcu(&msc->all_msc_list, &mpam_all_msc); platform_set_drvdata(pdev, msc); + snprintf(name, sizeof(name), "msc.%u", msc->id); + msc->debugfs = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_x32("max_nrdy_usec", 0400, msc->debugfs, &msc->nrdy_usec); + return msc; } @@ -2698,6 +2716,102 @@ static int mpam_allocate_config(void) return 0; } +static void mpam_debugfs_setup_ris(struct mpam_msc_ris *ris) +{ + char name[40]; + struct dentry *d; + struct mpam_props *rprops = &ris->props; + + snprintf(name, sizeof(name), "ris.%u", ris->ris_idx); + d = 
debugfs_create_dir(name, ris->vmsc->msc->debugfs); + debugfs_create_x64("mpamf_idr", 0400, d, &ris->idr); + debugfs_create_x32("mpamf_cpor_idr", 0400, d, &ris->cpor_idr); + debugfs_create_x32("mpamf_ccap_idr", 0400, d, &ris->ccap_idr); + debugfs_create_ulong("features", 0400, d, &rprops->features[0]); + debugfs_create_x16("cpbm_wd", 0400, d, &rprops->cpbm_wd); + debugfs_create_x16("mbw_pbm_bits", 0400, d, &rprops->mbw_pbm_bits); + debugfs_create_x16("num_csu_mon", 0400, d, &rprops->num_csu_mon); + debugfs_create_x16("num_mbwu_mon", 0400, d, &rprops->num_mbwu_mon); + debugfs_create_cpumask("affinity", 0400, d, &ris->affinity); + ris->debugfs = d; +} + +static void mpam_debugfs_setup_vmsc(struct mpam_component *comp, + struct mpam_vmsc *vmsc) +{ + u8 ris_idx; + char name[40]; + char path[40]; + struct dentry *d; + struct mpam_msc_ris *ris; + int msc_id = vmsc->msc->id; + + snprintf(name, sizeof(name), "vmsc.%u", msc_id); + d = debugfs_create_dir(name, comp->debugfs); + debugfs_create_ulong("features", 0400, d, &vmsc->props.features[0]); + vmsc->debugfs = d; + + list_for_each_entry_rcu(ris, &vmsc->ris, vmsc_list) { + ris_idx = ris->ris_idx; + + snprintf(name, sizeof(name), "msc.%u_ris.%u", msc_id, + ris_idx); + snprintf(path, sizeof(path), "../../../msc.%u/ris.%u", + msc_id, ris_idx); + debugfs_create_symlink(name, d, path); + } +} + +static void mpam_debugfs_setup_comp(struct mpam_class *class, + struct mpam_component *comp) +{ + char name[40]; + struct dentry *d; + struct mpam_vmsc *vmsc; + + snprintf(name, sizeof(name), "comp.%u", comp->comp_id); + d = debugfs_create_dir(name, class->debugfs); + comp->debugfs = d; + + list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) + mpam_debugfs_setup_vmsc(comp, vmsc); +} + +static void mpam_debugfs_setup(void) +{ + char name[40]; + struct dentry *d; + struct mpam_msc *msc; + struct mpam_class *class; + struct mpam_msc_ris *ris; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + 
list_for_each_entry(msc, &mpam_all_msc, all_msc_list) { + d = msc->debugfs; + debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); + debugfs_create_x32("iface", 0400, d, &msc->iface); + debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + list_for_each_entry(ris, &msc->ris, msc_list) + mpam_debugfs_setup_ris(ris); + } + + list_for_each_entry_rcu(class, &mpam_classes, classes_list) { + snprintf(name, sizeof(name), "class.%u", class->level); + d = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_ulong("features", 0400, d, &class->props.features[0]); + debugfs_create_x32("nrdy_usec", 0400, d, &class->nrdy_usec); + debugfs_create_x16("quirks", 0400, d, &class->quirks); + debugfs_create_x8("level", 0400, d, &class->level); + debugfs_create_cpumask("affinity", 0400, d, &class->affinity); + class->debugfs = d; + + list_for_each_entry_rcu(comp, &class->components, class_list) + mpam_debugfs_setup_comp(class, comp); + } +} + static void mpam_enable_once(void) { int err; @@ -2731,6 +2845,8 @@ static void mpam_enable_once(void) pr_err("Failed to allocate configuration arrays.\n"); break; } + + mpam_debugfs_setup(); } while (0); mutex_unlock(&mpam_list_lock); cpus_read_unlock(); @@ -2952,6 +3068,8 @@ static int __init mpam_msc_driver_init(void) return -EINVAL; } + mpam_debugfs = debugfs_create_dir("mpam", NULL); + return platform_driver_register(&mpam_msc_driver); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index bee58b8347d3..ff860859a91e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -130,6 +131,8 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + struct dentry *debugfs; + /* Values only used on some platforms for quirks */ u32 t241_id; @@ -310,6 +313,7 @@ struct mpam_class { struct ida ida_csu_mon; struct ida ida_mbwu_mon; + struct dentry *debugfs; struct mpam_garbage garbage; }; @@ 
-344,6 +348,7 @@ struct mpam_component { /* parent: */ struct mpam_class *class; + struct dentry *debugfs; struct mpam_garbage garbage; }; @@ -362,12 +367,15 @@ struct mpam_vmsc { /* parent: */ struct mpam_component *comp; + struct dentry *debugfs; struct mpam_garbage garbage; }; struct mpam_msc_ris { u8 ris_idx; u64 idr; + u32 cpor_idr; + u32 ccap_idr; struct mpam_props props; bool in_reset_state; @@ -385,6 +393,7 @@ struct mpam_msc_ris { /* msmon mbwu configuration is preserved over reset */ struct msmon_mbwu_state *mbwu_state; + struct dentry *debugfs; struct mpam_garbage garbage; }; -- Gitee From f4cc0ca01875d1344768fb052b4ff2bf6ba89e4f Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Jul 2024 17:05:31 +0100 Subject: [PATCH 15/66] NVIDIA: VR: SAUCE: arm_mpam: Add force-disable debugfs trigger ANBZ: #36714 commit 0c6a605638f6e7c64d595632d1294e95c60e57c9 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAM has an error interrupt that can be triggered by an MSC when corrupt or out of range values are seen. The hardware only needs to raise an error interrupt if the error was detected, it is also permissible for the hardware to just use the corrupt or out of range value. All the reasons to raise an error indicate a software bug. When the error interrupt is triggered, the MPAM driver attempts to reset all the CPUs back to PARTID-0 and reset PARTID-0 to be unrestricted. This is done to ensure important tasks aren't accidentally given the performance of unimportant tasks. This teardown path in the driver is hard to trigger. Add a debugfs file to poke this manually. It is expected you have to reboot to make MPAM work again after this. 
Signed-off-by: James Morse (cherry picked from commit 2c4e1fed02be2c50642680f9d99a1c3424e5b7b6 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9728150e6e22..9ecff0dc83c9 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2812,6 +2812,33 @@ static void mpam_debugfs_setup(void) } } +static int mpam_force_disable_show(struct seq_file *s, void *data) +{ + seq_puts(s, "Write 1 to this file to trigger an MPAM error.\n"); + return 0; +} + +static ssize_t mpam_force_disable_write(struct file *file, + const char __user *userbuf, size_t count, + loff_t *ppos) +{ + u32 user_val; + int err; + + err = kstrtou32_from_user(userbuf, count, 10, &user_val); + if (err) + return err; + + if (user_val == 1) { + mpam_disable_reason = "debugfs trigger"; + mpam_disable(NULL); + } + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(mpam_force_disable); + static void mpam_enable_once(void) { int err; @@ -2851,6 +2878,9 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + debugfs_create_file("force_disable", 0600, mpam_debugfs, NULL, + &mpam_force_disable_fops); + if (!err) { err = mpam_resctrl_setup(); if (err) -- Gitee From ee637e48f00e2321ded49d2d6117acd66700ccf2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Jul 2025 12:02:57 +0100 Subject: [PATCH 16/66] NVIDIA: VR: SAUCE: arm_mpam: Expose the number of NRDY retries in debugfs ANBZ: #36714 commit e0da6a8d52cb8fdb1a2907301144e87e735264c9 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 It's really popular to tie NRDY high, and then act surprised when the OS never reads the counters, because they aren't ready. 
The spec obliges hardware to clear this bit automatically before the firmware advertised timeout. To make it easier to find errant hardware, count the number of retries and expose that number in debugfs. Signed-off-by: James Morse (cherry picked from commit 4fa427c7f312e037a8080dffc62663664b976905 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 5 ++++- drivers/resctrl/mpam_internal.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9ecff0dc83c9..6df0daa8e58c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1319,8 +1319,10 @@ static void __ris_msmon_read(void *arg) } mpam_mon_sel_unlock(msc); - if (nrdy) + if (nrdy) { + msc->nrdy_retry_count++; m->err = -EBUSY; + } if (m->err) return; @@ -2793,6 +2795,7 @@ static void mpam_debugfs_setup(void) debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); debugfs_create_x32("iface", 0400, d, &msc->iface); debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + debugfs_create_x64("nrdy_retry_count", 0400, d, &msc->nrdy_retry_count); list_for_each_entry(ris, &msc->ris, msc_list) mpam_debugfs_setup_ris(ris); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index ff860859a91e..e27e96d48ce6 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -68,6 +68,7 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; u32 nrdy_usec; + u64 nrdy_retry_count; cpumask_t accessibility; bool has_extd_esr; -- Gitee From 1eafd3c203f5da26405c094ff4fb6117b5dcdf90 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:56 +0100 Subject: [PATCH 17/66] 
NVIDIA: SAUCE: arm_mpam: Add resctrl_arch_round_bw() ANBZ: #36714 commit a4231552d56984b4e5da6ae7eb4c04bd2c6e927f NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add the required hook to pre-round a userspace memory bandwidth allocation percentage value to a value acceptable to the driver backend. For MPAM, no rounding is needed because the driver has all the information necessary for rounding the value when resctrl_arch_update_one() is called. So, just "round" the value to itself here. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 935611d607afe707a00b0311fdbb500b8acdd654 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/arm_mpam.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- include/linux/arm_mpam.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index f92a36187a52..4ccf32fe07fd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include #include @@ -76,6 +77,19 @@ static inline void resctrl_arch_disable_mon(void) { } static inline void resctrl_arch_enable_alloc(void) { } static inline void resctrl_arch_disable_alloc(void) { } +struct resctrl_schema; + +struct rdt_resource; +static inline u32 resctrl_arch_round_bw(u32 val, + const struct rdt_resource *r __always_unused) +{ + /* + * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary + * context to round the incoming value correctly. 
+ */ + return val; +} + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; -- Gitee From 711d5bb6f4085cb7f67954f2a79b33d1801597c5 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:55 +0100 Subject: [PATCH 18/66] NVIDIA: SAUCE: fs/resctrl,x86/resctrl: Factor mba rounding to be per-arch ANBZ: #36714 commit efea2576c13e859ef15164c5a16d012d90eed4f0 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 The control value parser for the MB resource currently coerces the memory bandwidth percentage value from userspace to be an exact multiple of the bw_gran parameter. On MPAM systems, this results in somewhat worse-than-worst-case rounding, since bw_gran is in general only an approximation to the actual hardware granularity, and the hardware bandwidth allocation control value is not natively a percentage. Allow the arch to provide its own conversion that is appropriate for the hardware, and move the existing conversion to x86. This will avoid accumulated error from rounding the value twice on MPAM systems. Clarify the documentation, but avoid overly exact promises. Clamping to bw_min and bw_max still feels generic: leave it in the core code, for now. No functional change. 
Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit cabdc680e1dde14521ab2a61ff32b525b3ba334e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- Documentation/filesystems/resctrl.rst | 7 +++---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 ++++++ fs/resctrl/ctrlmondata.c | 2 +- include/linux/resctrl.h | 2 ++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b003bed339fd..e9ff59c2e57e 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -236,12 +236,11 @@ with respect to allocation: user can request. "bandwidth_gran": - The granularity in which the memory bandwidth + The approximate granularity in which the memory bandwidth percentage is allocated. The allocated b/w percentage is rounded off to the next - control step available on the hardware. The - available bandwidth control steps are: - min_bandwidth + N * bandwidth_gran. + control step available on the hardware. The available + steps are at least as small as this value. 
"delay_linear": Indicates if the delay scale is linear or diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b20e705606b8..d539e56c2b1f 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,9 +16,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include "internal.h" +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +{ + return roundup(val, (unsigned long)r->membw.bw_gran); +} + int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 9a7dfc48cb2e..0c02451c687b 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -71,7 +71,7 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - *data = roundup(bw, (unsigned long)r->membw.bw_gran); + *data = resctrl_arch_round_bw(bw, r); return true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b6cea5e2caf7..2c4691bb5b65 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -505,6 +505,8 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. 
-- Gitee From aa18f7f448cdb9245773f63467ed4564ac86e750 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Sep 2025 14:29:42 +0100 Subject: [PATCH 19/66] NVIDIA: VR: SAUCE: arm_mpam: Split the locking around the mon_sel registers ANBZ: #36714 commit 5ce93d2d841e6d49b4d4c080e698c78070f955f8 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 The MSC MON_SEL register needs to be accessed from hardirq for the overflow interrupt, and when taking an IPI to access these registers on platforms where MSC are not accessible from every CPU. This makes an irqsave spinlock the obvious lock to protect these registers. On systems with SCMI mailboxes it must be able to sleep, meaning a mutex must be used. The SCMI platforms can't support an overflow interrupt. Clearly these two can't exist for one MSC at the same time. Split the existing helper into a raw spinlock and a mutex, named inner and outer. The outer lock must be taken in a pre-emptible context before the inner lock can be taken. On systems with SCMI mailboxes where the MON_SEL accesses must sleep - the inner lock will fail to be taken if the caller is unable to sleep. This will allow callers to fail without having to explicitly check the interface type of each MSC. 
Signed-off-by: James Morse (forward ported from commit 46584f5584d0d2eb939b0ab0e43b93e6a0665096 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; - Add outer lock in mpam_cpu_offline(); - Fix outer lock issue in __allocate_component_cfg() by moving __destroy_component_cfg() outside outer lock scope; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 54 ++++++++++++++++++-------- drivers/resctrl/mpam_internal.h | 68 ++++++++++++++++++++++++--------- 2 files changed, 88 insertions(+), 34 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6df0daa8e58c..16cc77c9f7d2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -751,7 +751,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return false; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | @@ -765,7 +765,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) _mpam_write_monsel_reg(msc, mon_reg, 0); now = _mpam_read_monsel_reg(msc, mon_reg); can_clear = !(now & MSMON___NRDY); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); return (!can_set || !can_clear); } @@ -889,7 +889,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu_xcl, props); /* Is NRDY hardware managed? 
*/ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); } @@ -923,7 +925,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) } /* Is NRDY hardware managed? */ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); @@ -1217,7 +1221,7 @@ static void __ris_msmon_read(void *arg) struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; - if (!mpam_mon_sel_lock(msc)) { + if (!mpam_mon_sel_inner_lock(msc)) { m->err = -EIO; return; } @@ -1317,7 +1321,7 @@ static void __ris_msmon_read(void *arg) default: m->err = -EINVAL; } - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); if (nrdy) { msc->nrdy_retry_count++; @@ -1341,6 +1345,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) struct mpam_msc *msc = vmsc->msc; struct mpam_msc_ris *ris; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { arg->ris = ris; @@ -1359,6 +1364,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) if (err) any_err = err; } + mpam_mon_sel_outer_unlock(msc); } return any_err; @@ -1441,18 +1447,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) continue; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) continue; ris->mbwu_state[ctx->mon].correction = 0; ris->mbwu_state[ctx->mon].reset_on_next_read = true; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + 
mpam_mon_sel_outer_unlock(msc); } } @@ -1653,8 +1661,11 @@ static int mpam_restore_mbwu_state(void *_ris) u64 val; struct mon_read mwbu_arg; struct mpam_msc_ris *ris = _ris; + struct mpam_msc *msc = ris->vmsc->msc; struct mpam_class *class = ris->vmsc->comp->class; + mpam_mon_sel_outer_lock(msc); + for (i = 0; i < ris->props.num_mbwu_mon; i++) { if (ris->mbwu_state[i].enabled) { mwbu_arg.ris = ris; @@ -1666,10 +1677,12 @@ static int mpam_restore_mbwu_state(void *_ris) } } + mpam_mon_sel_outer_unlock(msc); + return 0; } -/* Call with MSC cfg_lock held */ +/* Call with MSC lock and outer mon_sel lock held */ static int mpam_save_mbwu_state(void *arg) { int i; @@ -1684,7 +1697,7 @@ static int mpam_save_mbwu_state(void *arg) mbwu_state = &ris->mbwu_state[i]; cfg = &mbwu_state->cfg; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return -EIO; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | @@ -1709,7 +1722,7 @@ static int mpam_save_mbwu_state(void *arg) cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); mbwu_state->correction += val; mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } return 0; @@ -1904,6 +1917,7 @@ static int mpam_cpu_offline(unsigned int cpu) struct mpam_msc_ris *ris; mutex_lock(&msc->cfg_lock); + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { mpam_touch_msc(msc, &mpam_reset_ris, ris); @@ -1917,6 +1931,7 @@ static int mpam_cpu_offline(unsigned int cpu) if (mpam_is_enabled()) mpam_touch_msc(msc, &mpam_save_mbwu_state, ris); } + mpam_mon_sel_outer_unlock(msc); mutex_unlock(&msc->cfg_lock); } } @@ -2618,11 +2633,13 @@ static void __destroy_component_cfg(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; - if (mpam_mon_sel_lock(msc)) { + mpam_mon_sel_outer_lock(msc); + if (mpam_mon_sel_inner_lock(msc)) { 
list_for_each_entry(ris, &vmsc->ris, vmsc_list) add_to_garbage(ris->mbwu_state); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } } @@ -2669,6 +2686,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) mpam_reset_component_cfg(comp); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + int err = 0; struct mpam_msc *msc; struct mpam_msc_ris *ris; struct msmon_mbwu_state *mbwu_state; @@ -2677,6 +2695,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry(ris, &vmsc->ris, vmsc_list) { if (!ris->props.num_mbwu_mon) continue; @@ -2684,17 +2703,22 @@ static int __allocate_component_cfg(struct mpam_component *comp) mbwu_state = kzalloc_objs(*ris->mbwu_state, ris->props.num_mbwu_mon); if (!mbwu_state) { - __destroy_component_cfg(comp); - return -ENOMEM; + err = -ENOMEM; + break; } init_garbage(&mbwu_state[0].garbage); - if (mpam_mon_sel_lock(msc)) { + if (mpam_mon_sel_inner_lock(msc)) { ris->mbwu_state = mbwu_state; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } } + mpam_mon_sel_outer_unlock(msc); + if (err) { + __destroy_component_cfg(comp); + return err; + } } return 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e27e96d48ce6..02966f5b3b94 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -118,16 +118,20 @@ struct mpam_msc { /* * mon_sel_lock protects access to the MSC hardware registers that are * affected by MPAMCFG_MON_SEL, and the mbwu_state. - * Access to mon_sel is needed from both process and interrupt contexts, - * but is complicated by firmware-backed platforms that can't make any - * access unless they can sleep. - * Always use the mpam_mon_sel_lock() helpers. - * Accesses to mon_sel need to be able to fail if they occur in the wrong - * context. + * Both the 'inner' and 'outer' must be taken. 
+ * For real MMIO MSC, the outer lock is unnecessary - but keeps the + * code common with: + * Firmware backed MSC need to sleep when accessing the MSC, which + * means some code-paths will always fail. For these MSC the outer + * lock is providing the protection, and the inner lock fails to + * be taken if the task is unable to sleep. + * * If needed, take msc->probe_lock first. */ - raw_spinlock_t _mon_sel_lock; - unsigned long _mon_sel_flags; + struct mutex outer_mon_sel_lock; + bool outer_lock_held; + raw_spinlock_t inner_mon_sel_lock; + unsigned long inner_mon_sel_flags; void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; @@ -140,30 +144,56 @@ struct mpam_msc { struct mpam_garbage garbage; }; -/* Returning false here means accesses to mon_sel must fail and report an error. */ -static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +static inline bool __must_check mpam_mon_sel_inner_lock(struct mpam_msc *msc) { - /* Locking will require updating to support a firmware backed interface */ - if (WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO)) - return false; + /* + * The outer lock may be taken by a CPU that then issues an IPI to run + * a helper that takes the inner lock. lockdep can't help us here. 
+ */ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) { + raw_spin_lock_irqsave(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); + return true; + } + + /* Accesses must fail if we are not pre-emptible */ + return !!preemptible(); +} - raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); - return true; +static inline void mpam_mon_sel_inner_unlock(struct mpam_msc *msc) +{ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) + raw_spin_unlock_irqrestore(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); +} + +static inline void mpam_mon_sel_outer_lock(struct mpam_msc *msc) +{ + mutex_lock(&msc->outer_mon_sel_lock); + msc->outer_lock_held = true; } -static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +static inline void mpam_mon_sel_outer_unlock(struct mpam_msc *msc) { - raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); + msc->outer_lock_held = false; + mutex_unlock(&msc->outer_mon_sel_lock); } static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) { - lockdep_assert_held_once(&msc->_mon_sel_lock); + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + if (msc->iface == MPAM_IFACE_MMIO) + lockdep_assert_held_once(&msc->inner_mon_sel_lock); + else + lockdep_assert_preemption_enabled(); } static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) { - raw_spin_lock_init(&msc->_mon_sel_lock); + raw_spin_lock_init(&msc->inner_mon_sel_lock); + mutex_init(&msc->outer_mon_sel_lock); } /* Bits for mpam features bitmaps */ -- Gitee From 2cf8e9d0103d5efef16c50370804d7f598354a65 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 29 Oct 2021 16:13:51 +0100 Subject: [PATCH 20/66] NVIDIA: VR: SAUCE: arm_mpam: Allow the maximum partid to be overridden from the command line ANBZ: #36714 commit 2a3468a1b843c03e7c7a104b574e6a527569bbb9 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAMs bandwidth monitors are only available via resctrl if 
there are enough monitors for each combination of partid and pmg to have one. As it is unlikely anyone built that many monitors, allow the maximum partid the system will use to be set from the kernel command-line. With this, it should be possible for bandwidth monitors to be enabled by reducing the number of partid in use. Signed-off-by: James Morse (cherry picked from commit f12f00ec8d977b8ea8c78986ef34cd9c898e8b2b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 16cc77c9f7d2..2cedf4848a2f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ static DEFINE_MUTEX(mpam_cpuhp_state_lock); u16 mpam_partid_max; u8 mpam_pmg_max; static bool partid_max_init, partid_max_published; +static u16 mpam_cmdline_partid_max; +static bool mpam_cmdline_partid_max_overridden; static DEFINE_SPINLOCK(partid_max_lock); /* @@ -308,6 +311,9 @@ int mpam_register_requestor(u16 partid_max, u8 pmg_max) return -EBUSY; } + if (mpam_cmdline_partid_max_overridden) + mpam_partid_max = min(mpam_cmdline_partid_max, mpam_partid_max); + return 0; } EXPORT_SYMBOL(mpam_register_requestor); @@ -3133,6 +3139,38 @@ static int __init mpam_msc_driver_init(void) /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); +static int mpam_cmdline_partid_max_set(const char *arg, + const struct kernel_param *kp) +{ + int ret; + + spin_lock(&partid_max_lock); + ret = kstrtou16(arg, 10, &mpam_cmdline_partid_max); + if (!ret) + 
mpam_cmdline_partid_max_overridden = true; + spin_unlock(&partid_max_lock); + + return 0; +} +static int mpam_cmdline_partid_max_get(char *buffer, + const struct kernel_param *kp) +{ + u16 val = 0xffff; + + spin_lock(&partid_max_lock); + if (mpam_cmdline_partid_max_overridden) + val = mpam_cmdline_partid_max; + spin_unlock(&partid_max_lock); + + return sprintf(buffer, "%u\n", val); +} +static const struct kernel_param_ops mpam_cmdline_partid_max_ops = { + .set = mpam_cmdline_partid_max_set, + .get = mpam_cmdline_partid_max_get, +}; +module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); +MODULE_PARM_DESC(partid_max, "Override for reducing the number of PARTID."); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif -- Gitee From 51b5a98fcb987ed60c64846757ce71833425ee16 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 1 Jul 2025 17:03:13 +0100 Subject: [PATCH 21/66] NVIDIA: VR: SAUCE: arm_mpam: Allow MSC to be forced to have an unknown location ANBZ: #36714 commit ebc43b8c83e1d67d4455e99d0bfc0b7d05c3ac3e NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 The MPAM driver discovers which MSC control which system resources from firmware tables. The MPAM resctrl picking code then attempts to export platforms that are Xeon shaped via resctrl. Occasionally, the presence of one or more MSC prevents the platform being described as Xeon shaped, and exposed via resctrl. For example with CPU-less NUMA nodes. The additional node doensn't have an L3, so can't have domain-ids exposed for the 'MB' memory bandwidth controls. In this example, some users would prefer to control bandwidth on just the CPU nodes, instead of having nothing at all. Allow users an amount of wiggle room by allowing MSC to be forced to be treated as unknown. This effectively disables parts of the MPAM functionality. Unknown MSC are not disabled, They are still probed and contribute to the system wide properties. 
Suggested-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 542e79e9f52b4a9889de0c586a9db2bed5ecfa03 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 64 +++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 2 ++ 2 files changed, 66 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 2cedf4848a2f..9889219d88e0 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include "mpam_internal.h" @@ -629,6 +631,9 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, { int err; + if (mpam_force_unknown_msc_test(msc)) + type = MPAM_CLASS_UNKNOWN; + mutex_lock(&mpam_list_lock); err = mpam_ris_create_locked(msc, ris_idx, type, class_id, component_id); @@ -3171,6 +3176,65 @@ static const struct kernel_param_ops mpam_cmdline_partid_max_ops = { module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); MODULE_PARM_DESC(partid_max, "Override for reducing the number of PARTID."); +static DEFINE_XARRAY(mpam_force_unkown_msc); + +static void mpam_force_unknown_msc_add(u32 msc_id, gfp_t gfp) +{ + xa_store(&mpam_force_unkown_msc, msc_id, xa_mk_value(msc_id), gfp); +} + +bool mpam_force_unknown_msc_test(struct mpam_msc *msc) +{ + return !!xa_load(&mpam_force_unkown_msc, msc->pdev->id); +} + +static int mpam_force_unknown_msc_set(const char *_str, + const struct kernel_param *kp) +{ + int err; + u32 val; + char *tok, *iter; + char *str __free(kfree) = kstrdup(_str, GFP_KERNEL); + + iter = str; + do { + tok = strsep(&iter, ","); + err = kstrtou32(tok, 10, &val); + if (err) { + pr_err("Failed to parse commandline: %d\n", err); + break; + } + 
mpam_force_unknown_msc_add(val, GFP_KERNEL); + } while (iter); + + return 0; +} +static int mpam_force_unknown_msc_get(char *buffer, + const struct kernel_param *kp) +{ + unsigned long index, count = 0; + int result = 0; + void *entry; + + xa_for_each(&mpam_force_unkown_msc, index, entry) { + if (count) + result += sprintf(buffer + result, ","); + + result += sprintf(buffer + result, "%lu", index); + count += 1; + } + + result += sprintf(buffer + result, "\n"); + + return result; +} +static const struct kernel_param_ops mpam_force_unknown_msc_ops = { + .set = mpam_force_unknown_msc_set, + .get = mpam_force_unknown_msc_get, +}; +subsys_param_cb(force_unknown_msc, &mpam_force_unknown_msc_ops, NULL, 0644); +MODULE_PARM_DESC(force_unknown_msc, "Disabling a set of probed MSC."); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 02966f5b3b94..d17c7512d807 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -524,6 +524,8 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +bool mpam_force_unknown_msc_test(struct mpam_msc *msc); + #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); -- Gitee From c57573ef7db2313f15518036b572b1712bff67ac Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 Sep 2022 18:00:40 +0100 Subject: [PATCH 22/66] NVIDIA: VR: SAUCE: fs/resctrl: Add this_is_not_abi mount option ANBZ: #36714 commit a4370607dbe4efa78526eff2189a6d0e6f35ad19 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Some later things in the MPAM tree enable behaviour that resctrl doesn't have upstream. To make it clear to people using the out-of-tree code that they shouldn't be relying on this in user-space, add a mount option to enable this stuff. 
Signed-off-by: James Morse (forward ported from commit 8bd00259ac52ebb244ced984c744135e8d7f4b7d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; - Call disable_abi_playground() on out path in rdt_get_tree(); ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/internal.h | 3 +++ fs/resctrl/rdtgroup.c | 63 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 24e328a5b18b..7e1067b9e326 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -43,6 +43,7 @@ struct rdt_fs_context { bool enable_mba_mbps; bool enable_debug; bool enable_hwdrc; + bool enable_abi_playground; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -315,6 +316,8 @@ struct mbm_state { u32 prev_bw; }; +DECLARE_STATIC_KEY_FALSE(resctrl_abi_playground); + extern struct mutex rdtgroup_mutex; static inline const char *rdt_kn_name(const struct kernfs_node *kn) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ec2e19a2c6fb..7305936f8a90 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -87,6 +87,9 @@ enum resctrl_event_id mba_mbps_default_event; static bool resctrl_debug; +/* Enable wacky behaviour that is not supported upstream. 
*/ +DEFINE_STATIC_KEY_FALSE(resctrl_abi_playground); + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -2843,6 +2846,42 @@ static void schemata_list_destroy(void) } } +static void hack_file_mode(const char *name, u16 mode) +{ + struct rftype *rfts, *rft; + int len; + + mutex_lock(&rdtgroup_mutex); + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + rft->mode = mode; + } + + mutex_unlock(&rdtgroup_mutex); +} + +static void enable_abi_playground(void) +{ + static_key_enable(&resctrl_abi_playground.key); + + /* Make the tasks file read only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0444); +} + +static void disable_abi_playground(void) +{ + static_key_disable(&resctrl_abi_playground.key); + + /* Make the tasks file read/write only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0644); +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -2853,6 +2892,9 @@ static int rdt_get_tree(struct fs_context *fc) DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + if (ctx->enable_abi_playground) + enable_abi_playground(); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -2960,6 +3002,10 @@ static int rdt_get_tree(struct fs_context *fc) rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); + + if (ret && ctx->enable_abi_playground) + disable_abi_playground(); + return ret; } @@ -2969,6 +3015,7 @@ enum rdt_param { Opt_mba_mbps, Opt_hwdrc, Opt_debug, + Opt_not_abi_playground, nr__rdt_params }; @@ -2978,6 +3025,13 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("mba_MBps", Opt_mba_mbps), fsparam_flag("hwdrc", Opt_hwdrc), fsparam_flag("debug", Opt_debug), + + /* + * Some of MPAM's out of tree code exposes things through resctrl + * that need much more discussion before they are considered for + * mainline. 
Add a mount option that can be used to hide these crimes. + */ + fsparam_flag("this_is_not_abi", Opt_not_abi_playground), {} }; @@ -3014,6 +3068,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; + case Opt_not_abi_playground: + ctx->enable_abi_playground = true; + return 0; } return -EINVAL; @@ -3260,6 +3317,9 @@ static void rdt_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); + + if (static_branch_unlikely(&resctrl_abi_playground)) + disable_abi_playground(); } static struct file_system_type rdt_fs_type = { @@ -4315,6 +4375,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); + if (static_branch_unlikely(&resctrl_abi_playground)) + seq_puts(seq, ",this_is_not_abi"); + return 0; } -- Gitee From 09b7795b2e01eaff834097bec22d7d4fa2808587 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:45:41 +0100 Subject: [PATCH 23/66] NVIDIA: VR: SAUCE: kobject: Add kset_get_next_obj() to allow a kset to be walked ANBZ: #36714 commit d34afaf8edef2268ab1e229333d4ae8393e2ba62 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 To expose iommu_groups via the resctrl filesystem, the resctrl driver needs to be able to walk the list of iommu_groups. These are exposed via sysfs as a kset. Add kset_get_next_obj() to allow resctrl to walk the kobjects in the kset. 
Signed-off-by: James Morse (cherry picked from commit 10d03a8e2abf6eb69227b8674463d8a70ceb9c94 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit d34afaf8edef2268ab1e229333d4ae8393e2ba62) Signed-off-by: Jay Chen --- include/linux/kobject.h | 2 ++ lib/kobject.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 6f4f63d05643..3692b279cb22 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -216,6 +216,8 @@ static inline const struct kobj_type *get_ktype(const struct kobject *kobj) struct kobject *kset_find_obj(struct kset *, const char *); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev); + /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ diff --git a/lib/kobject.c b/lib/kobject.c index 9c9ff0f5175f..518d95cce975 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -920,6 +920,27 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) } EXPORT_SYMBOL_GPL(kset_find_obj); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev) +{ + struct kobject *k; + + spin_lock(&kset->list_lock); + + if (!prev) + k = list_first_entry_or_null(&kset->list, typeof(*k), entry); + else + k = list_next_entry(prev, entry); + + if (list_entry_is_head(k, &kset->list, entry)) + k = NULL; + + kobject_get(k); + spin_unlock(&kset->list_lock); + kobject_put(prev); + + return k; +} + static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); -- Gitee From aa9274897e183970bb03038850463ac92880bba1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:19:43 +0100 Subject: [PATCH 24/66] NVIDIA: VR: SAUCE: iommu: Add helpers to 
retrieve iommu_groups by id or kobject ANBZ: #36714 commit 2cf654d1bcf2b987640ba21cd5f54e3c5228636b NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 ARM SMMUs with MPAM support are able to mark streams of traffic with the QoS labels MPAM uses. The user-space interface for MPAM is the resctrl filesystem, which allows threads to be moved between groups; it's natural to do the same for iommu_groups. The resctrl interface lists threads, so it will also need to list iommu_groups. To do this, it will be necessary to walk the list of iommu_groups. To ensure this matches what user-space sees via sysfs, it is best to walk the kobjects. When making a change, resctrl will only have the id of a group. To avoid walking the list of kobjects in this case, add iommu_group_get_by_id(). Signed-off-by: James Morse (cherry picked from commit 9b7dcc8fab78bf2545b02c53add7af27c21e5e90 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 2cf654d1bcf2b987640ba21cd5f54e3c5228636b) Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 2 files changed, 46 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 61c12ba78206..6d18ada7ff30 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1110,6 +1110,40 @@ struct iommu_group *iommu_group_alloc(void) } EXPORT_SYMBOL_GPL(iommu_group_alloc); +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + struct iommu_group *group; + + if (!iommu_group_kset || !group_kobj) + return NULL; + + group = container_of(group_kobj, struct iommu_group, kobj); + + kobject_get(group->devices_kobj); + kobject_put(&group->kobj); + + return group; +} + +struct iommu_group *iommu_group_get_by_id(int id) +{ + struct kobject *group_kobj; + const char *name; + + if (!iommu_group_kset) + return NULL; + + name
= kasprintf(GFP_KERNEL, "%d", id); + if (!name) + return NULL; + + group_kobj = kset_find_obj(iommu_group_kset, name); + kfree(name); + + return iommu_group_get_from_kobj(group_kobj); +} +EXPORT_SYMBOL_GPL(iommu_group_get_by_id); + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1af1c457b8a6..478451cf0d74 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -925,6 +925,8 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return iommu_paging_domain_alloc_flags(dev, 0); } +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj); +extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1241,6 +1243,16 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) return ERR_PTR(-ENODEV); } +static inline struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + return NULL; +} + +static inline struct iommu_group *iommu_group_get_by_id(int id) +{ + return NULL; +} + static inline void iommu_domain_free(struct iommu_domain *domain) { } -- Gitee From 05452f6545550e8311264d0a2fd82bfe52cdc417 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 13:10:25 +0100 Subject: [PATCH 25/66] NVIDIA: VR: SAUCE: iommu: Add helper to retrieve iommu kset ANBZ: #36714 commit 205a3429710fe164cfb8b86484d589ee781abffc NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 To walk the list of iommu groups visible in sysfs, resctrl needs access to iommu_group_kset. Expose it. 
Signed-off-by: James Morse (cherry picked from commit 99cc3d17db3cbf3038957b25429a98e2a6dd5a58 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 205a3429710fe164cfb8b86484d589ee781abffc) Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 5 +++++ include/linux/iommu.h | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6d18ada7ff30..fc88b3588fa3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1144,6 +1144,11 @@ struct iommu_group *iommu_group_get_by_id(int id) } EXPORT_SYMBOL_GPL(iommu_group_get_by_id); +struct kset *iommu_get_group_kset(void) +{ + return kset_get(iommu_group_kset); +} + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 478451cf0d74..eaa38b58f806 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -984,6 +984,7 @@ extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); +struct kset *iommu_get_group_kset(void); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); int iommu_set_pgtable_quirks(struct iommu_domain *domain, @@ -1411,6 +1412,11 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } +static inline struct kset *iommu_get_group_kset(void) +{ + return NULL; +} + static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { -- Gitee From 9c20943819c5f5c9dc355b11b30156461084abee Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 13:19:13 +0100 Subject: [PATCH 26/66] NVIDIA: VR: SAUCE: iommu/arm-smmu-v3: Register SMMU capabilities with MPAM ANBZ: #36714 commit 
bfa7580f6c106c58f9b955ea678b476569cae804 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Traffic in the system can be tagged with a PARTID and PMG. Different requestors can support a different number of bits for these fields. Before MPAM can be used, the MPAM driver has to discover the minimum number of bits supported by any requestor, which affects the range of PARTID and PMG that can be used. Detect whether the SMMU supports MPAM, if it does provide the MPAM driver with the maximum PARTID and PMG values. Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 254691aaeac0832fd3daa0bab0ec5ba18c93bdc2 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 6 +++++ 2 files changed, 34 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e8d7dbe495f0..ea008853d7b1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -5009,6 +5010,29 @@ static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) hw_features, fw_features); } +static void arm_smmu_mpam_register_smmu(struct arm_smmu_device *smmu) +{ + u16 partid_max; + u8 pmg_max; + u32 reg; + + if (!IS_ENABLED(CONFIG_ARM64_MPAM)) + return; + + if (!(smmu->features & ARM_SMMU_FEAT_MPAM)) + return; + + reg = readl_relaxed(smmu->base + ARM_SMMU_MPAMIDR); + if (!reg) + return; + + partid_max = FIELD_GET(SMMU_MPAMIDR_PARTID_MAX, reg); + pmg_max = FIELD_GET(SMMU_MPAMIDR_PMG_MAX, reg); + + if (mpam_register_requestor(partid_max, pmg_max)) + 
smmu->features &= ~ARM_SMMU_FEAT_MPAM; +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -5156,6 +5180,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->features |= ARM_SMMU_FEAT_RANGE_INV; if (FIELD_GET(IDR3_FWB, reg)) smmu->features |= ARM_SMMU_FEAT_S2FWB; + if (FIELD_GET(IDR3_MPAM, reg)) + smmu->features |= ARM_SMMU_FEAT_MPAM; if (FIELD_GET(IDR3_BBM, reg) == 2) smmu->features |= ARM_SMMU_FEAT_BBML2; @@ -5221,6 +5247,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (arm_smmu_sva_supported(smmu)) smmu->features |= ARM_SMMU_FEAT_SVA; + arm_smmu_mpam_register_smmu(smmu); + dev_info(smmu->dev, "oas %lu-bit (features 0x%08x)\n", smmu->oas, smmu->features); return 0; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ef42df4753ec..60f4d0385c56 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -58,6 +58,7 @@ struct arm_vsmmu; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_MPAM (1 << 7) #define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define IDR3_BBM GENMASK(12, 11) @@ -169,6 +170,10 @@ struct arm_vsmmu; #define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8 #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc +#define ARM_SMMU_MPAMIDR 0x130 +#define SMMU_MPAMIDR_PARTID_MAX GENMASK(15, 0) +#define SMMU_MPAMIDR_PMG_MAX GENMASK(23, 16) + #define ARM_SMMU_REG_SZ 0xe00 /* Common MSI config fields */ @@ -854,6 +859,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_HD (1 << 22) #define ARM_SMMU_FEAT_S2FWB (1 << 23) #define ARM_SMMU_FEAT_BBML2 (1 << 24) +#define ARM_SMMU_FEAT_MPAM (1 << 25) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) -- Gitee From b059479cc9b1b5d7e392cf3584ddc30c13581947 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 14 Sep 2021 17:57:42 +0100 Subject: [PATCH 27/66] NVIDIA: VR: SAUCE: iommu/arm-smmu-v3: Add mpam helpers to query and set 
state ANBZ: #36714 commit 5c6ee0c8f6b6a9593c7c43f3ef1f13ae7abb5bd5 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Use the properties in the STE, as these only apply to one stream. The MPAM driver also needs to know the maximum partid and pmg values that the SMMU can generate. This allows it to determine the system-wide common supported range of values. Add a helper to return this id register. Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit d847012696d29c61687420d4a6621f1f9e9bf95d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/iommu/iommu.c`; - Resolve minor conflicts in `include/linux/iommu.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++ drivers/iommu/iommu.c | 6 ++ include/linux/iommu.h | 7 ++ 4 files changed, 112 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ea008853d7b1..f6c80ec5f261 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4370,6 +4370,96 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, + u8 pmg) +{ + int i; + u32 sid; + unsigned long flags; + struct arm_smmu_ste *step; + struct iommu_domain *domain; + struct arm_smmu_device *smmu; + struct arm_smmu_master *master; + struct arm_smmu_cmdq_batch cmds; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_CFGI_STE, + .cfgi = { + .leaf = true, + }, + }; + 
struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + smmu = smmu_domain->smmu; + + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); + + /* These need locking if the VMSPtr is ever used */ + step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); + } + + master->partid = partid; + master->pmg = pmg; + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + arm_smmu_cmdq_batch_submit(smmu, &cmds); + + return 0; +} + +static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, + u8 *pmg) +{ + int err = -EINVAL; + unsigned long flags; + struct iommu_domain *domain; + struct arm_smmu_master *master; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + + if (!partid && !pmg) + return 0; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + if (master) { + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; + err = 0; + } + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + return err; +} + static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, @@ -4383,6 
+4473,8 @@ static const struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .get_group_qos_params = arm_smmu_group_get_mpam, + .set_group_qos_params = arm_smmu_group_set_mpam, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .get_viommu_size = arm_smmu_get_viommu_size, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 60f4d0385c56..85d47843943a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -275,6 +275,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_MEV (1UL << 19) #define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) +#define STRTAB_STE_1_S1MPAM (1UL << 26) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) #define STRTAB_STE_1_EATS_ABT 0UL @@ -305,6 +306,10 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +#define STRTAB_STE_4_PARTID GENMASK_ULL(31, 16) + +#define STRTAB_STE_5_PMG GENMASK_ULL(7, 0) + /* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ #define STRTAB_STE_0_NESTING_ALLOWED \ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ @@ -951,6 +956,8 @@ struct arm_smmu_master { bool stall_enabled; unsigned int ssid_bits; unsigned int iopf_refcount; + u16 partid; + u8 pmg; }; /* SMMU private data for an IOMMU domain */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index fc88b3588fa3..5b646c7d97f4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2163,6 +2163,12 @@ void iommu_domain_free(struct iommu_domain *domain) } EXPORT_SYMBOL_GPL(iommu_domain_free); +struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group) +{ + return group->domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_group); + /* * Put the group's 
domain back to the appropriate core-owned domain - either the * standard kernel-mode DMA configuration or an all-DMA-blocked domain. diff --git a/include/linux/iommu.h b/include/linux/iommu.h index eaa38b58f806..eeaf13bc5a04 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -715,6 +715,12 @@ struct iommu_ops { struct iommu_domain *parent_domain, const struct iommu_user_data *user_data); + /* Per group IOMMU features */ + int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + u8 *perf_mon_grp); + int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + u8 perf_mon_grp); + const struct iommu_domain_ops *default_domain_ops; struct module *owner; struct iommu_domain *identity_domain; @@ -933,6 +939,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); +extern struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group); struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, -- Gitee From 9a180174cf58fdd3b8c2334258d400cfa3e9dbbe Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 12:07:30 +0100 Subject: [PATCH 28/66] NVIDIA: VR: SAUCE: iommu: Add helpers to get and set the QoS state ANBZ: #36714 commit 25fb0768a14b7c36ff17d0b238357ccb61ef68d9 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Add helpers that call the iommu driver's get/set methods for these parameters. 
Signed-off-by: James Morse (cherry picked from commit 630242d2001b19a0f214de47640202efc3d09260 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/iommu.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg (cherry picked from commit 25fb0768a14b7c36ff17d0b238357ccb61ef68d9) Signed-off-by: Jay Chen --- drivers/iommu/iommu.c | 76 +++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 15 +++++++++ 2 files changed, 91 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5b646c7d97f4..95e354a98749 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4180,3 +4180,79 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return ret; } #endif /* CONFIG_IRQ_MSI_IOMMU */ + +/* + * iommu_group_set_qos_params() - Set the QoS parameters for a group + * @group: the iommu group. + * @partition: the partition label all traffic from the group should use. + * @perf_mon_grp: the performance label all traffic from the group should use. + * + * Return: 0 on success, or an error. + */ +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->set_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_set_qos_params, "IOMMUFD_INTERNAL"); + +/* + * iommu_group_get_qos_params() - Get the QoS parameters for a group + * @group: the iommu group. 
+ * @partition: the partition label all traffic from the group uses. + * @perf_mon_grp: the performance label all traffic from the group uses. + * + * Return: 0 on success, or an error. + */ +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->get_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_get_qos_params, "IOMMUFD_INTERNAL"); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index eeaf13bc5a04..d871b0c31568 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1220,6 +1220,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp); +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp); /* PCI device reset functions */ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); @@ -1562,6 +1566,17 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} +static inline int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + return -ENODEV; +} + +static inline int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + return -ENODEV; +} static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) { -- 
Gitee From d573a331dc5061023e1486cc6d0ea51b68ebfe96 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 09:51:04 +0100 Subject: [PATCH 29/66] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add iommu helpers to get/set the partid and pmg ANBZ: #36714 commit 8716fdd41befde3246c03843b13c1625a7f6ceff NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 SMMUs that support MPAM can be configured to use a particular partid and pmg for a stream. The assignment of an iommu_group and its corresponding streams should be done via resctrl. Add helpers similar to setting a closid/rmid on a task. We need the same shifting if the CPUs are using CDP. The SMMU only takes one partid, conceptually it's always making data accesses. Signed-off-by: James Morse (cherry picked from commit af9d3e292738a626d784e463509344b3dde55880 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/Kconfig | 1 + drivers/resctrl/mpam_resctrl.c | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 672abea3b03c..30f455dba5aa 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -29,3 +29,4 @@ config ARM64_MPAM_RESCTRL_FS default y if ARM64_MPAM_DRIVER && RESCTRL_FS select RESCTRL_RMID_DEPENDS_ON_CLOSID select RESCTRL_ASSIGN_FIXED + select RESCTRL_IOMMU if ARM_SMMU_V3 diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index aa260a1e1186..4e6fb59fe397 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,58 @@ bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) return (tsk_closid == closid) && (tsk_rmid == rmid); } +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group
*group, u32 closid, + u32 rmid) +{ + u16 partid; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return 0; + + if (cdp_enabled) + partid = closid << 1; + else + partid = closid; + + return iommu_group_set_qos_params(group, partid, rmid); +} + +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid) +{ + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, NULL); + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + partid >>= 1; + + return (partid == closid); +} + +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + u8 pmg; + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, &pmg); + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + partid >>= 1; + + return (partid == closid) && (rmid == pmg); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) -- Gitee From ab774018fe668e0b308a2f46335a23be176be87b Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 17:11:58 +0100 Subject: [PATCH 30/66] NVIDIA: VR: SAUCE: fs/resctrl: Add support for assigning iommu_groups to resctrl groups ANBZ: #36714 commit 3602b35d8ea3c1e4a7bed25745399ac193d4c385 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Arm's MPAM has support for assigning devices behind an IOMMU to a control or monitor group. This can be used for device-passthrough for a VM, or user-space drivers using VFIO to ensure the device is either in the same control group as the CPU threads. Alternatively, the iommu_group may be assigned to a different control group with preferential schema values. Extend the resctrl tasks file to include iommu_groups. These appear as 'iommu_group:0', where 0 is the group number that can be found from /sys/kernel/iommu_groups/. 
iommu_groups can be moved between resctrl groups by writing this string in the same way as tasks are moved. No state is preserved by resctrl: an iommu_group that disappears will no longer be listed as being part of a resctrl group. A new iommu_group will appear in the default group. Add helpers to list and move iommu_groups. Architecture specific helpers are used to apply the closid/rmid to the iommu_group due to the way MPAM emulates CDP. Tested-by: Amit Singh Tomar Signed-off-by: James Morse (forward ported from commit 8a09f730ab48859282f518615f993df7f9ccba2a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/acpi/arm64/mpam.c`; - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; - Fix guid_equal(&spec_uuid, &int_tbl_uuid) checking issue; - Fix iommu_group_put() issue in show_rdt_iommu(); ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/acpi/arm64/mpam.c | 91 +++++++++++++++++++++++++++++++-- fs/resctrl/Kconfig | 6 +++ fs/resctrl/rdtgroup.c | 104 +++++++++++++++++++++++++++++++++++++- include/linux/resctrl.h | 28 ++++++++++ 4 files changed, 225 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 84963a20c3e7..0bd9ff1fd978 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -95,17 +95,51 @@ static void acpi_mpam_parse_irqs(struct platform_device *pdev, res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); } -static int acpi_mpam_parse_resource(struct mpam_msc *msc, +#define UUID_MPAM_INTERCONNECT_TABLE "fe2bd645-033b-49e6-9479-2e0b8b21d1cd" + +struct acpi_mpam_interconnect_descriptor_table { + u8 type_uuid[16]; + u32 num_descriptors; +}; + +struct acpi_mpam_interconnect_descriptor { + u32 source_id; + u32 destination_id; + u8 link_type; + u8 reserved[3]; +}; + +static int acpi_mpam_parse_resource(struct acpi_mpam_msc_node *tbl_msc, + struct
mpam_msc *msc, struct acpi_mpam_resource_node *res) { + struct acpi_mpam_interconnect_descriptor_table *tbl_int_tbl; + struct acpi_mpam_interconnect_descriptor *tbl_int; + guid_t int_tbl_uuid, spec_uuid; int level, nid; u32 cache_id; + off_t offset; + /* + * Class IDs are somewhat arbitrary, but need to be co-ordinated. + * 0-N are caches, + * 64, 65: Interconnect, but ideally these would appear between the + * classes the controls are adjacent to. + * 128: SMMU, + * 192-192+level: Memory Side Caches, nothing checks that N is a + * small number. + * 255: Memory Controllers + * + * ACPI devices would need a class id allocated based on the _HID. + * + * Classes that the mpam driver can't currently plumb into resctrl + * are registered as UNKNOWN. + */ switch (res->locator_type) { case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: cache_id = res->locator.cache_locator.cache_reference; level = find_acpi_cache_level_from_id(cache_id); - if (level <= 0) { + if (level <= 0 || level >= 64) { pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); return -EINVAL; } @@ -120,6 +154,57 @@ static int acpi_mpam_parse_resource(struct mpam_msc *msc, } return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, MPAM_CLASS_ID_DEFAULT, nid); + case ACPI_MPAM_LOCATION_TYPE_SMMU: + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 128, res->locator.smmu_locator.smmu_interface); + case ACPI_MPAM_LOCATION_TYPE_MEMORY_CACHE: + cache_id = res->locator.mem_cache_locator.reference; + level = res->locator.mem_cache_locator.level; + if (192 + level >= 255) { + pr_err_once("Bad level (%u) for memory side cache with reference %u\n", + level, cache_id); + return -EINVAL; + } + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + 192 + level, cache_id); + + case ACPI_MPAM_LOCATION_TYPE_INTERCONNECT: + /* Find the descriptor table, and check it lands in the parent msc */ + offset = res->locator.interconnect_ifc_locator.inter_connect_desc_tbl_off; + if 
(offset >= tbl_msc->length) { + pr_err_once("Bad offset (%lu) for interconnect descriptor on msc %u\n", + offset, tbl_msc->identifier); + return -EINVAL; + } + tbl_int_tbl = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor_table, + tbl_msc, offset); + guid_parse(UUID_MPAM_INTERCONNECT_TABLE, &spec_uuid); + import_guid(&int_tbl_uuid, tbl_int_tbl->type_uuid); + if (!guid_equal(&spec_uuid, &int_tbl_uuid)) { + pr_err_once("Bad UUID for interconnect descriptor on msc %u\n", + tbl_msc->identifier); + return -EINVAL; + } + + offset += sizeof(*tbl_int_tbl); + offset += tbl_int_tbl->num_descriptors * sizeof(*tbl_int); + if (offset >= tbl_msc->length) { + pr_err_once("Bad num_descriptors (%u) for interconnect descriptor on msc %u\n", + tbl_int_tbl->num_descriptors, tbl_msc->identifier); + return -EINVAL; + } + + tbl_int = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor, + tbl_int_tbl, sizeof(*tbl_int_tbl)); + cache_id = tbl_int->source_id; + + /* Unknown link type? */ + if (tbl_int->link_type != 0 && tbl_int->link_type != 1) + return 0; + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 64 + tbl_int->link_type, cache_id); default: /* These get discovered later and are treated as unknown */ return 0; @@ -150,7 +235,7 @@ int acpi_mpam_parse_resources(struct mpam_msc *msc, return -EINVAL; } - err = acpi_mpam_parse_resource(msc, resource); + err = acpi_mpam_parse_resource(tbl_msc, msc, resource); if (err) return err; diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig index 21671301bd8a..145d837c190a 100644 --- a/fs/resctrl/Kconfig +++ b/fs/resctrl/Kconfig @@ -37,3 +37,9 @@ config RESCTRL_RMID_DEPENDS_ON_CLOSID Enabled by the architecture when the RMID values depend on the CLOSID. This causes the CLOSID allocator to search for CLOSID with clean RMID. + +config RESCTRL_IOMMU + bool + help + Enabled by the architecture when some IOMMU are able to be configured + with CLOSID/RMID.
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 7305936f8a90..0a2dabfe9061 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -769,10 +770,65 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, return ret; } +static int rdtgroup_move_iommu(int iommu_group_id, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + struct iommu_group *iommu_group; + int err; + + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + rdt_last_cmd_printf("No permission to move iommu_group %d\n", + iommu_group_id); + return -EPERM; + } + + iommu_group = iommu_group_get_by_id(iommu_group_id); + if (!iommu_group) { + rdt_last_cmd_printf("No matching iommu_group %d\n", + iommu_group_id); + return -ESRCH; + } + + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_iommu_closid(iommu_group, + rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move iommu_group to different control group\n"); + err = -EINVAL; + } else { + err = resctrl_arch_set_iommu_closid_rmid(iommu_group, + rdtgrp->closid, + rdtgrp->mon.rmid); + } + + iommu_group_put(iommu_group); + + return err; +} + +static bool string_is_iommu_group(char *buf, int *val) +{ + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return false; + + if (strlen(buf) <= strlen("iommu_group:")) + return false; + + if (strncmp(buf, "iommu_group:", strlen("iommu_group:"))) + return false; + + buf += strlen("iommu_group:"); + + return !kstrtoint(buf, 0, val); +} + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + int iommu_group_id; + bool is_iommu; char *pid_str; int ret = 0; pid_t pid; @@ -794,7 +850,13 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, while (buf && buf[0] != '\0' && buf[0] != '\n') { pid_str = 
strim(strsep(&buf, ",")); - if (kstrtoint(pid_str, 0, &pid)) { + is_iommu = string_is_iommu_group(pid_str, &iommu_group_id); + if (is_iommu) { + ret = rdtgroup_move_iommu(iommu_group_id, rdtgrp, of); + if (ret) + break; + continue; + } else if (kstrtoint(pid_str, 0, &pid)) { rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); ret = -EINVAL; break; @@ -819,6 +881,44 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static bool iommu_matches_rdtgroup(struct iommu_group *group, struct rdtgroup *r) +{ + if (r->type == RDTCTRL_GROUP) + return resctrl_arch_match_iommu_closid(group, r->closid); + + return resctrl_arch_match_iommu_closid_rmid(group, r->closid, + r->mon.rmid); +} + +static void show_rdt_iommu(struct rdtgroup *r, struct seq_file *s) +{ + struct kset *iommu_groups; + struct iommu_group *group; + struct kobject *group_kobj = NULL; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return; + + iommu_groups = iommu_get_group_kset(); + + while ((group_kobj = kset_get_next_obj(iommu_groups, group_kobj))) { + /* iommu_group_get_from_kobj() wants to drop a reference */ + kobject_get(group_kobj); + + group = iommu_group_get_from_kobj(group_kobj); + if (!group) + continue; + + if (iommu_matches_rdtgroup(group, r)) + seq_printf(s, "iommu_group:%s\n", group_kobj->name); + + iommu_group_put(group); + } + + kset_put(iommu_groups); +} + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; @@ -833,6 +933,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) } } rcu_read_unlock(); + + show_rdt_iommu(r, s); } static int rdtgroup_tasks_show(struct kernfs_open_file *of, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2c4691bb5b65..2478f3f50e75 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -3,6 +3,7 @@ #define _RESCTRL_H #include +#include #include #include #include @@ 
-712,6 +713,7 @@ extern unsigned int resctrl_rmid_realloc_limit; int resctrl_init(void); void resctrl_exit(void); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK u64 resctrl_arch_get_prefetch_disable_bits(void); int resctrl_arch_pseudo_lock_fn(void *_plr); @@ -725,4 +727,30 @@ static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +/* When supported, the architecture must implement these */ +#ifdef CONFIG_RESCTRL_IOMMU +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid); +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +#else +static inline int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return -EOPNOTSUPP; +} +static inline bool resctrl_arch_match_iommu_closid(struct iommu_group *group, + u32 closid) +{ + return false; +} +static inline bool +resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return false; +} +#endif /* CONFIG_RESCTRL_IOMMU */ #endif /* _RESCTRL_H */ -- Gitee From 14cfe587c68677a48c83465026d9ce3a17ceb44d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 16:46:12 +0000 Subject: [PATCH 31/66] NVIDIA: SAUCE: x86/resctrl: Add stub to allow other architecture to disable monitor overflow ANBZ: #36714 commit 567de3911e8d361de75e6c8078685fe8d08dd194 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. 
To allow other architectures to disable this, add a helper that reports whether counters can overflow. Signed-off-by: James Morse (cherry picked from commit 6a4360b3e0339ffc510b68d7a7d22941030f0604 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- arch/x86/include/asm/resctrl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 575f8408a9e7..40a74a061734 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -191,6 +191,11 @@ static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx) { } +static inline bool resctrl_arch_mon_can_overflow(void) +{ + return true; +} + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else -- Gitee From 070b66c24193910d04697996375e2a8ddb5b6b7c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 17:32:53 +0000 Subject: [PATCH 32/66] NVIDIA: SAUCE: arm_mpam: resctrl: Determine if any exposed counter can overflow ANBZ: #36714 commit 618eec86f2f583c325ae04f894343bc9577a381a NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. To allow the overflow handler to be disabled, determine if an overflow can happen. If a class is not implemented, or has the 63bit counter, it can't overflow. 
Signed-off-by: James Morse (cherry picked from commit 0f6aefdf5164dd6be3bd8c6cd82b6257fadbeab2 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 21 +++++++++++++++++++++ include/linux/arm_mpam.h | 1 + 2 files changed, 22 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 4e6fb59fe397..ec602f6fd6e1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -535,6 +535,27 @@ static int update_rmid_limits(struct mpam_class *class) return 0; } +static bool __resctrl_arch_mon_can_overflow(enum resctrl_event_id eventid) +{ + struct mpam_props *cprops; + struct mpam_class *class = mpam_resctrl_counters[eventid].class; + + if (!class) + return false; + + /* No need to worry about a 63 bit counter overflowing */ + cprops = &class->props; + return !mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops); +} + +bool resctrl_arch_mon_can_overflow(void) +{ + if (__resctrl_arch_mon_can_overflow(QOS_L3_MBM_LOCAL_EVENT_ID)) + return true; + + return __resctrl_arch_mon_can_overflow(QOS_L3_MBM_TOTAL_EVENT_ID); +} + static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, enum mpam_device_features mon_type, diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 4ccf32fe07fd..b066d57e1a08 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -53,6 +53,7 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +bool resctrl_arch_mon_can_overflow(void); void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); -- Gitee From 
3930ddf0239dabf858c6f998d858bcf97d051214 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 17:36:02 +0000 Subject: [PATCH 33/66] NVIDIA: SAUCE: fs/resctrl: Allow the overflow handler to be disabled ANBZ: #36714 commit 65148a97523c9e1cf6b7f6706f14f4d94aa68b1c NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. Call the new arch helpers to determine this. Signed-off-by: James Morse (cherry picked from commit 72e375a4611a0eb5355e5a171a67a419ffd53522 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/monitor.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e338b8d48405..94da0360952d 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -918,8 +918,10 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del /* * When a domain comes online there is no guarantee the filesystem is * mounted. If not, there is no need to catch counter overflow. + * Some architecture may have ~64bit counters, and can ignore overflow.
*/ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) + if (!resctrl_mounted || !resctrl_arch_mon_capable() || + !resctrl_arch_mon_can_overflow()) return; cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; -- Gitee From 488435b19108d7783dc7edb583acc6792da1237d Mon Sep 17 00:00:00 2001 From: Rex Nie Date: Mon, 11 Mar 2024 16:18:39 +0800 Subject: [PATCH 34/66] NVIDIA: VR: SAUCE: fs/resctrl: Uniform data type of component_id/domid/id/cache_id ANBZ: #36714 commit e5a4f2f5181baf01c816b2223fafaa99b41e15bc NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 This patch unifies the data type of component_id/domid/id/cache_id as u32 to avoid type confusion. According to the ACPI for MPAM specification, the cache id is used as the locator for a cache MSC. Judging by the RD_PPTT_CACHE_ID definition in edk2-platforms, u32 is enough for cache_id: ( \ (((PackageId) & 0xF) << 20) | (((ClusterId) & 0xFF) << 12) | \ (((CoreId) & 0xFF) << 4) | ((CacheType) & 0xF) \ ) refs: 1. ACPI for mpam: https://developer.arm.com/documentation/den0065/latest/ 2.
RD_PPTT_CACHE_ID from edk2-platforms: https://github.com/tianocore/edk2-platforms/blob/master/Platform/ARM/SgiPkg/Include/SgiAcpiHeader.h#L202 Signed-off-by: Rex Nie Signed-off-by: James Morse (cherry picked from commit 6941241fa2fd78befa42cd442507157701c98878 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/arm_mpam.h`; - Resolve minor conflicts in `include/linux/resctrl.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 8 ++++---- include/linux/arm_mpam.h | 4 ++-- include/linux/resctrl.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9889219d88e0..f1bd25b41253 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -371,7 +371,7 @@ mpam_class_find(u8 level_idx, enum mpam_class_types type) } static struct mpam_component * -mpam_component_alloc(struct mpam_class *class, int id) +mpam_component_alloc(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -413,7 +413,7 @@ static void mpam_component_destroy(struct mpam_component *comp) } static struct mpam_component * -mpam_component_find(struct mpam_class *class, int id) +mpam_component_find(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -539,7 +539,7 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, - int component_id) + u32 component_id) { int err; struct mpam_vmsc *vmsc; @@ -627,7 +627,7 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id) + enum mpam_class_types type, u8 class_id, u32 component_id) { int err; diff --git 
a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index b066d57e1a08..0891f8538367 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -41,11 +41,11 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } #ifdef CONFIG_ARM64_MPAM_DRIVER int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id); + enum mpam_class_types type, u8 class_id, u32 component_id); #else static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, - int component_id) + u32 component_id) { return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2478f3f50e75..7891fb0a0332 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -138,7 +138,7 @@ enum resctrl_domain_type { */ struct rdt_domain_hdr { struct list_head list; - int id; + u32 id; enum resctrl_domain_type type; enum resctrl_res_level rid; struct cpumask cpu_mask; -- Gitee From ba948558a1ef2ad718b45a76d69772501c04cb46 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 27 Aug 2024 15:24:08 +0100 Subject: [PATCH 35/66] NVIDIA: SAUCE: arm_mpam: Allow cmax/cmin to be configured ANBZ: #36714 commit c4f4cbd811757144ba3fca83ce261fdfb2c14b84 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 mpam_reprogram_ris_partid() always resets the CMAX/CMIN controls to their 'unrestricted' value. This prevents the controls from being configured. Add fields in struct mpam_config, and program these values when they are set in the features bitmask. 
Signed-off-by: James Morse (cherry picked from commit e701b2860ae2c02dc9c2015846d61838904a5b0b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 23 +++++++++++++++++++---- drivers/resctrl/mpam_internal.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f1bd25b41253..4ef5414ce0ae 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1635,11 +1635,25 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); - if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) - mpam_write_partsel_reg(msc, CMAX, cmax); + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { + u32 cmax_val = cfg->cmax; - if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) - mpam_write_partsel_reg(msc, CMIN, 0); + if (cfg->cmax_softlim) + cmax_val |= MPAMCFG_CMAX_SOFTLIM; + mpam_write_partsel_reg(msc, CMAX, cmax_val); + } else { + mpam_write_partsel_reg(msc, CMAX, cmax); + } + } + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmin, cfg)) { + mpam_write_partsel_reg(msc, CMIN, cfg->cmin); + } else { + mpam_write_partsel_reg(msc, CMIN, 0); + } + } if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); @@ -3080,6 +3094,7 @@ static bool mpam_update_config(struct mpam_config *cfg, bool has_changes = false; maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, 
mpam_feat_cmax_cmax, newcfg, cmax, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d17c7512d807..407f0b2f5014 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -355,6 +355,10 @@ struct mpam_config { u32 cpbm; u32 mbw_pbm; u16 mbw_max; + u16 cmax; + u16 cmin; + + bool cmax_softlim; struct mpam_garbage garbage; }; -- Gitee From 35093616ac83f55a64f27a8ee4a5f3e3a30198a6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:37:26 +0000 Subject: [PATCH 36/66] NVIDIA: SAUCE: arm_mpam: Rename mbw conversion to 'fract16' for code re-use ANBZ: #36714 commit fe0cb8dd5f644ffee3bcaf4a0840f88c73ccacbc NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Functions like mbw_max_to_percent() convert a value into MPAM's 16-bit fixed-point fraction format. These are not only used for memory bandwidth, but for cache capacity controls too. Rename these functions to convert to/from a 'fract16', and add helpers for the specific mbw_max/cmax controls.
Signed-off-by: James Morse (cherry picked from commit 738f1605fb5c796713a429214270a18ec9c5d6c3 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; - Resolve minor conflicts in `drivers/resctrl/test_mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 24 +++++++++++++++++------- drivers/resctrl/test_mpam_resctrl.c | 4 ++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ec602f6fd6e1..42df14976966 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -854,14 +854,14 @@ static u32 get_mba_granularity(struct mpam_props *cprops) * * Find the nearest percentage value to the upper bound of the selected band: */ -static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = mbw_max; + u32 val = fract; - val >>= 16 - cprops->bwa_wd; + val >>= 16 - wd; val += 1; val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + val = DIV_ROUND_CLOSEST(val, 1 << wd); return val; } @@ -876,18 +876,28 @@ static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) * percentages) and over-commit (where the total of the converted * allocations is greater than expected). 
*/ -static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +static u16 percent_to_fract16(u8 pc, u8 wd) { u32 val = pc; - val <<= cprops->bwa_wd; + val <<= wd; val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); val = max(val, 1) - 1; - val <<= 16 - cprops->bwa_wd; + val <<= 16 - wd; return val; } +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + return fract16_to_percent(mbw_max, cprops->bwa_wd); +} + +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->bwa_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { if (!mba_class_use_mbw_max(cprops)) { diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index 4145f057bd31..2dd28336b3d1 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -133,7 +133,7 @@ static void test_get_mba_granularity(struct kunit *test) KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ } -static void test_mbw_max_to_percent(struct kunit *test) +static void test_fract16_to_percent(struct kunit *test) { const struct percent_value_case *param = test->param_value; struct percent_value_test_info res; @@ -359,7 +359,7 @@ static void test_num_assignable_counters(struct kunit *test) static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), - KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_fract16_to_percent, test_percent_value_gen_params), KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), KUNIT_CASE(test_percent_to_max_rounding), -- Gitee From a1b3f22012ad38d89e2ece3bb385b9219a23a76e Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 18 Nov 2024 18:45:50 +0000 Subject: [PATCH 37/66] NVIDIA: SAUCE: fs/resctrl: Group all the MBA specific properties in a separate struct ANBZ: #36714 commit 
cd4001b628f0528cbaec08e937835b970797bea1 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 struct resctrl_membw combines parameters that are related to the control value, and parameters that are specific to the MBA resource. To allow the control value parsing and management code to be re-used for other resources, it needs to be separated from the MBA resource. Add struct resctrl_mba that holds all the parameters that are specific to the MBA resource. Signed-off-by: James Morse (cherry picked from commit c1133462aa498d8b75e73b094eb91512d982e067 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- arch/x86/kernel/cpu/resctrl/core.c | 18 +++++++++--------- drivers/resctrl/mpam_resctrl.c | 4 ++-- fs/resctrl/ctrlmondata.c | 3 ++- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 28 ++++++++++++++++++---------- 5 files changed, 40 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7b7171a212c7..33be69fa6ddf 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -393,21 +393,21 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; r->membw.max_bw = MAX_MBA_BW; - r->membw.arch_needs_linear = true; + r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { - r->membw.delay_linear = true; + r->mba.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - max_delay; r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; - r->membw.arch_needs_linear = false; + r->mba.arch_needs_linear = false; } if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) - r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + r->mba.throttle_mode = THREAD_THROTTLE_PER_THREAD; else - r->membw.throttle_mode 
= THREAD_THROTTLE_MAX; + r->mba.throttle_mode = THREAD_THROTTLE_MAX; #ifdef CONFIG_X86_CPU_RESCTRL_INTEL_HWDRC r->hwdrc_capable = hwdrc_detect_intel(); @@ -435,14 +435,14 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.max_bw = 1 << eax; /* AMD does not use delay */ - r->membw.delay_linear = false; - r->membw.arch_needs_linear = false; + r->mba.delay_linear = false; + r->mba.arch_needs_linear = false; /* * AMD does not use memory delay throttle model to control * the allocation like Intel does. */ - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; @@ -511,7 +511,7 @@ static void mba_wrmsr_amd(struct msr_param *m) */ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - if (r->membw.delay_linear) + if (r->mba.delay_linear) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 42df14976966..25acac6f4d45 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1418,8 +1418,8 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->schema_fmt = RESCTRL_SCHEMA_RANGE; r->ctrl_scope = RESCTRL_L3_CACHE; - r->membw.delay_linear = true; - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.delay_linear = true; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0c02451c687b..1eac8f7dc07a 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -48,7 +48,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
*/ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0a2dabfe9061..4e2246da6f45 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1302,7 +1302,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.delay_linear); + seq_printf(seq, "%u\n", r->mba.delay_linear); return 0; } @@ -1320,7 +1320,7 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - switch (r->membw.throttle_mode) { + switch (r->mba.throttle_mode) { case THREAD_THROTTLE_PER_THREAD: seq_puts(seq, "per-thread\n"); return 0; @@ -1657,7 +1657,7 @@ bool is_mba_sc(struct rdt_resource *r) if (r->rid != RDT_RESOURCE_MBA) return false; - return r->membw.mba_sc; + return r->mba.mba_sc; } /* @@ -2279,13 +2279,13 @@ static void thread_throttle_mode_init(void) r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; + r_mba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->mba.throttle_mode; r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; + r_smba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->mba.throttle_mode; if (throttle_mode == THREAD_THROTTLE_UNDEFINED) return; @@ -2587,7 +2587,7 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, static inline bool is_mba_linear(void) { - 
return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->mba.delay_linear; } static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -2645,7 +2645,7 @@ static int set_mba_sc(bool mba_sc) if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; - r->membw.mba_sc = mba_sc; + r->mba.mba_sc = mba_sc; rdtgroup_default.mba_mbps_event = mba_mbps_default_event; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 7891fb0a0332..1d20275a39db 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -248,24 +248,30 @@ enum membw_throttle_mode { * @min_bw: Minimum memory bandwidth percentage user can request * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @arch_needs_linear: True if we can't configure non-linear resources - * @throttle_mode: Bandwidth throttling mode when threads request - * different memory bandwidths - * @mba_sc: True if MBA software controller(mba_sc) is enabled * @hwdrc: True if memory bandwidth HWDRC is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct resctrl_membw { u32 min_bw; u32 max_bw; u32 bw_gran; - u32 delay_linear; - bool arch_needs_linear; - enum membw_throttle_mode throttle_mode; - bool mba_sc; bool hwdrc; +}; + +/** + * struct resctrl_mba - Resource properties that are specific to the MBA resource + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + * @delay_linear: True if control is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Mode when threads request different control values + */ +struct resctrl_mba { + bool mba_sc; u32 *mb_map; + bool delay_linear; + bool arch_needs_linear; + 
enum membw_throttle_mode throttle_mode; + }; struct resctrl_schema; @@ -317,6 +323,7 @@ struct resctrl_mon { * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource + * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. * @cdp_capable: Is the CDP feature available on this resource @@ -332,6 +339,7 @@ struct rdt_resource { struct resctrl_cache cache; struct resctrl_membw membw; struct resctrl_mon mon; + struct resctrl_mba mba; struct list_head ctrl_domains; struct list_head mon_domains; char *name; -- Gitee From c1f23e8813756866c94110494b07ee3bf37ce700 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 11:33:53 +0100 Subject: [PATCH 38/66] NVIDIA: SAUCE: fs/resctrl: Abstract duplicate domain test to a helper ANBZ: #36714 commit 7bcced8cc98aeb71b0b09fbc83b9c452a12b2ffe NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 parse_cbm() and parse_bw() both test the staged config for an existing entry. These would indicate user-space has provided a schema with a duplicate domain entry. e.g: | L3:0=ffff;1=f00f;0=f00f If new parsers are added this duplicate domain test has to be duplicated. Move it to the caller. 
Signed-off-by: James Morse (cherry picked from commit 827c80b5ec1b14a0f3d77e12ad13a8fbbf499ccd https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/ctrlmondata.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/ctrlmondata.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1eac8f7dc07a..48ebc0f5bafb 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -84,12 +84,6 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 bw_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; @@ -98,6 +92,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, return 0; } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -165,12 +160,6 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 cbm_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - /* * Cannot set up more than one pseudo-locked region in a cache * hierarchy. 
@@ -207,6 +196,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, } } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = cbm_val; cfg->have_new_ctrl = true; @@ -264,13 +254,18 @@ static int parse_line(char *line, struct resctrl_schema *s, dom = strim(dom); list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { + cfg = &d->staged_config[t]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + data.buf = dom; data.closid = rdtgrp->closid; data.mode = rdtgrp->mode; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be -- Gitee From d21ede2a9579591e9aab8f777fa96e8987df5837 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:02:03 +0000 Subject: [PATCH 39/66] NVIDIA: SAUCE: fs/resctrl: Move MBA supported check to parse_line() instead of parse_bw() ANBZ: #36714 commit 81444f8241e5c1a480208c32b74230903a34304e NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MBA is only supported on platforms where the delay inserted by the control is linear. Resctrl checks the two properties provided by the arch code match each time it parses part of a new control value. This doesn't need to be done so frequently, and obscures changes to parse_bw() to abstract it for use with other control types. Move this check to the parse_line() caller so it only happens once. 
Signed-off-by: James Morse (cherry picked from commit 85be43b4b1214a6f88d5643a8973ec6808cec56c https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/ctrlmondata.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 48ebc0f5bafb..ec9ea0f60719 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -45,15 +45,6 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) int ret; u32 bw; - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (r->rid == RDT_RESOURCE_MBA && - !r->mba.delay_linear && r->mba.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - ret = kstrtou32(buf, 10, &bw); if (ret) { rdt_last_cmd_printf("Invalid MB value %s\n", buf); @@ -242,6 +233,15 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; -- Gitee From c04db99a90ae8357fb5a014d5bc11746620e612d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:55:45 +0000 Subject: [PATCH 40/66] NVIDIA: SAUCE: fs/resctrl: Rename resctrl_get_default_ctrl() to include resource ANBZ: #36714 commit 4f4cdb0ccf903f58a60e89d39ba06212832c78cf NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 resctrl_get_default_ctrl() is called by both the architecture code and filesystem code to return the default value for a control. This depends on the schema format. 
parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Rename resctrl_get_default_ctrl() to make it clear it returns the resource default. Signed-off-by: James Morse (cherry picked from commit a4ba73c6546aaf2eb6805ad910b27c55663843e0 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- arch/x86/kernel/cpu/resctrl/core.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- drivers/resctrl/mpam_resctrl.c | 10 +++++----- fs/resctrl/rdtgroup.c | 4 ++-- include/linux/resctrl.h | 13 ++++++++----- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 33be69fa6ddf..649b806531cd 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -564,7 +564,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = resctrl_get_default_ctrl(r); + *dc = resctrl_get_resource_default_ctrl(r); } static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 885026468440..8a017f111102 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -253,7 +253,7 @@ void 
resctrl_arch_reset_all_ctrls(struct rdt_resource *r) hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); + hw_dom->ctrl_val[i] = resctrl_get_resource_default_ctrl(r); msr_param.dom = d; smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 25acac6f4d45..93dfbd6a61df 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1411,7 +1411,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) * we have configured the SMMU and GIC not to do this 'all the * bits' is the correct answer here. */ - r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); r->alloc_capable = true; break; case RDT_RESOURCE_MBA: @@ -1562,7 +1562,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); if (!mpam_is_enabled()) - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); res = container_of(r, struct mpam_resctrl_res, resctrl_res); dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); @@ -1591,12 +1591,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } fallthrough; default: - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); } if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || !mpam_has_feature(configured_by, cfg)) - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); switch (configured_by) { case mpam_feat_cpor_part: @@ -1604,7 +1604,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case mpam_feat_mbw_max: return mbw_max_to_percent(cfg->mbw_max, cprops); default: - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); } } diff 
--git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 4e2246da6f45..61c2edd7b5d7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1111,7 +1111,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); return 0; } @@ -3863,7 +3863,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->new_ctrl = resctrl_get_resource_default_ctrl(r); cfg->have_new_ctrl = true; } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 1d20275a39db..484e74cacf19 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -325,7 +325,10 @@ struct resctrl_mon { * @mon_domains: RCU list of all monitor domains for this resource * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. - * @schema_fmt: Which format string and parser is used for this schema. + * @schema_fmt: Which format control parameters should be in for this resource. + * @evt_list: List of monitoring events + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource * @hwdrc_capable: Is the hardware Dynamic Resource Controller available * on this resource. @@ -407,11 +410,11 @@ struct resctrl_mon_config_info { void resctrl_arch_sync_cpu_closid_rmid(void *info); /** - * resctrl_get_default_ctrl() - Return the default control value for this - * resource. - * @r: The resource whose default control type is queried. + * resctrl_get_resource_default_ctrl() - Return the default control value for + * this resource. + * @r: The resource whose default control value is queried. 
*/ -static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) { switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: -- Gitee From 705a0694a2a42bcf0f16ffb02535c5fa137fad14 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 12:21:25 +0000 Subject: [PATCH 41/66] NVIDIA: SAUCE: fs/resctrl: Add a schema format to the schema, allowing it to be different ANBZ: #36714 commit 125c77a8cfa3ccb75359b8c9f469f4eec4a366bc NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Add a second schema format for use with mba_sc. The membw properties are copied and the schema version is used. When mba_sc is enabled the schema copy of these properties is modified. 
Signed-off-by: James Morse (cherry picked from commit 225d28eb849877c6b97dcdc466d8e1aa67978272 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/ctrlmondata.c`; - Resolve minor conflicts in `include/linux/arm_mpam.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/ctrlmondata.c | 14 ++++++------ fs/resctrl/rdtgroup.c | 26 +++++++++++++++++------ include/linux/arm_mpam.h | 4 +--- include/linux/resctrl.h | 24 ++++++++++++++++++++- 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index d539e56c2b1f..91ce05256a00 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -20,9 +20,9 @@ #include "internal.h" -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s) { - return roundup(val, (unsigned long)r->membw.bw_gran); + return roundup(val, (unsigned long)s->membw.bw_gran); } int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ec9ea0f60719..1e51c4a01e78 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -40,7 +40,7 @@ typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct resctrl_schema *s) { int ret; u32 bw; @@ -52,18 +52,18 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) } /* Nothing else to do if software controller is enabled. 
*/ - if (is_mba_sc(r)) { + if (is_mba_sc(s->res)) { *data = bw; return true; } - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + if (bw < s->membw.min_bw || bw > s->membw.max_bw) { rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); + bw, s->membw.min_bw, s->membw.max_bw); return false; } - *data = resctrl_arch_round_bw(bw, r); + *data = resctrl_arch_round_bw(bw, s); return true; } @@ -75,7 +75,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 bw_val; - if (!bw_validate(data->buf, &bw_val, r)) + if (!bw_validate(data->buf, &bw_val, s)) return -EINVAL; if (is_mba_sc(r)) { @@ -215,7 +215,7 @@ static int parse_line(char *line, struct resctrl_schema *s, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - switch (r->schema_fmt) { + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 61c2edd7b5d7..f0a5e51f4bc1 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1109,9 +1109,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_schema_default_ctrl(s)); return 0; } @@ -1252,9 +1251,8 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.min_bw); + seq_printf(seq, "%u\n", s->membw.min_bw); return 0; } @@ -1290,9 +1288,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, 
"%u\n", r->membw.bw_gran); + seq_printf(seq, "%u\n", s->membw.bw_gran); return 0; } @@ -2895,7 +2892,22 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type if (cl > max_name_width) max_name_width = cl; - switch (r->schema_fmt) { + s->schema_fmt = r->schema_fmt; + s->membw = r->membw; + + /* + * When mba_sc() is enabled the format used by user space is different + * to that expected by hardware. The conversion is done by + * update_mba_bw(). + */ + if (is_mba_sc(r)) { + s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->membw.min_bw = 0; + s->membw.max_bw = MBA_MAX_MBPS; + s->membw.bw_gran = 1; + } + + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 0891f8538367..d3983af5c679 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -79,10 +79,8 @@ static inline void resctrl_arch_enable_alloc(void) { } static inline void resctrl_arch_disable_alloc(void) { } struct resctrl_schema; - -struct rdt_resource; static inline u32 resctrl_arch_round_bw(u32 val, - const struct rdt_resource *r __always_unused) + const struct resctrl_schema *s __always_unused) { /* * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 484e74cacf19..3577a134273a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -364,9 +364,12 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. * @fmt_str: Format string to show domain value. + * @schema_fmt: Which format string and parser is used for this schema. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. 
+ * @membw The properties of the schema which may be different to the format + * that was specified by the resource, * @num_closid: The number of closid that can be used with this schema. When * features like CDP are enabled, this will be lower than the * hardware supports for the resource. @@ -375,8 +378,10 @@ struct resctrl_schema { struct list_head list; char name[8]; const char *fmt_str; + enum resctrl_schema_fmt schema_fmt; enum resctrl_conf_type conf_type; struct rdt_resource *res; + struct resctrl_membw membw; u32 num_closid; }; @@ -426,6 +431,23 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) return WARN_ON_ONCE(1); } +/** + * resctrl_get_schema_default_ctrl() - Return the default control value for + * this schema. + * @s: The schema whose default control value is queried. + */ +static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) +{ + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return resctrl_get_resource_default_ctrl(s->res); + case RESCTRL_SCHEMA_RANGE: + return s->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); @@ -517,7 +539,7 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s); /* * Update the ctrl_val and apply this config right now. 
-- Gitee From a3c2dab30b1f28f6448d49ce3cde2aa8577ea3cb Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 27 Sep 2024 17:59:15 +0100 Subject: [PATCH 42/66] NVIDIA: VR: SAUCE: fs/resctrl: Use schema format to check the resource is a bitmap ANBZ: #36714 commit c102337e914cf706771b2ad22157a15d34454eaf NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 rdtgroup_cbm_to_size() uses a WARN_ON_ONCE() to assert that the resource it has been passed is one of the L2 or L3 cache. This is to avoid using uninitialised bitmap properties. Updating this list for every resource that is configured by a bitmap doesn't scale. Instead change the WARN_ON_ONCE() to use the schema format the arch code requested for the resource. Signed-off-by: James Morse (cherry picked from commit 04f3b4e4e1fcd4fc02d59a4c7a27619f8abf4902 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/rdtgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index f0a5e51f4bc1..f74f256b4293 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1631,7 +1631,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct cacheinfo *ci; int num_b; - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + if (WARN_ON_ONCE(r->schema_fmt != RESCTRL_SCHEMA_BITMAP)) return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); @@ -1718,11 +1718,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else + + if (schema->schema_fmt == RESCTRL_SCHEMA_BITMAP) size = rdtgroup_cbm_to_size(r, d, ctrl); + else + size = ctrl; } seq_printf(s, "%d=%u", d->hdr.id, size); sep = true; -- Gitee From 
a1cbf7f28389161ff6159540bbe240209f4c52cb Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:15:54 +0000 Subject: [PATCH 43/66] NVIDIA: SAUCE: fs/resctrl: Add specific schema types for 'range' ANBZ: #36714 commit 032e23347616b3d6638e5e3bfcf8b654e0daee15 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl allows the architecture code to specify the schema format for a control. Controls can either take a bitmap, or some kind of number. If user-space doesn't know what a control is by its name, it could be told the schema format. 'Some kind of number' isn't useful as the difference between a percentage and a value in MB/s affects how these would be programmed, even if resctrl's parsing code doesn't need to care. Add the types resctrl already has in addition to 'range'. This allows architectures to move over before 'range' is removed. These new schema formats are parsed the same, but will additionally affect which files are visible. Schema formats with a double underscore should not be considered portable between architectures, and are likely to be described to user-space as 'platform defined'. AMDs MBA resource is configured with an absolute bandwidth measured in multiples of one eighth of a GB per second. resctrl needs to be aware of this platform defined format to ensure the existing 'MB' files continue to be shown. 
Signed-off-by: James Morse (cherry picked from commit bb81e4805d5120058ec44f793780bdf1e775cd5a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/ctrlmondata.c | 3 +++ fs/resctrl/rdtgroup.c | 3 +++ include/linux/resctrl.h | 12 ++++++++++++ 3 files changed, 18 insertions(+) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1e51c4a01e78..ec925ce6c877 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -220,6 +220,9 @@ static int parse_line(char *line, struct resctrl_schema *s, parse_ctrlval = &parse_cbm; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: parse_ctrlval = &parse_bw; break; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index f74f256b4293..e6877dbd28d7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2912,6 +2912,9 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type s->fmt_str = "%d=%x"; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: s->fmt_str = "%d=%u"; break; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3577a134273a..416bc1e0c2f3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -287,10 +287,16 @@ enum resctrl_scope { * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. + * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. + * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. 
*/ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, RESCTRL_SCHEMA_RANGE, + RESCTRL_SCHEMA_PERCENT, + RESCTRL_SCHEMA_MBPS, + RESCTRL_SCHEMA__AMD_MBA, }; /** @@ -425,6 +431,9 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return r->membw.max_bw; } @@ -442,6 +451,9 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return s->membw.max_bw; } -- Gitee From bbe260b29cf0fa36544deb6b9029062c11dd91b9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:30:08 +0000 Subject: [PATCH 44/66] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Convert MB resource to use percentage ANBZ: #36714 commit 96636b152b152b7ecaddd081a94ea806091aebbc NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAMs bandwidth controls are both exposed to resctrl as if they take a percentage. Update the schema format so that user-space can be told this is a percentage, and files that describe this control format are exposed. (e.g. min_percent) Existing variation in this area is covered by requiring user-space to know if it is running on an Intel or AMD platform. Exposing the schema format directly will avoid modifying user-space to know it is running on an MPAM or RISCV platform. MPAM can also expose bitmap controls for memory bandwidth, which may become important for use-cases in the future. These are currently converted to a percentage to fit the existing definition of the MB resource. 
Signed-off-by: James Morse (cherry picked from commit 2baa164d3c899703f228ad0d2e9ad7d4856203e8 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 93dfbd6a61df..d498f7aa8ce7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1415,7 +1415,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->alloc_capable = true; break; case RDT_RESOURCE_MBA: - r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; r->mba.delay_linear = true; -- Gitee From 82e402ec242d4ce06dacd6d7ea23e9422964961c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:19:37 +0000 Subject: [PATCH 45/66] NVIDIA: SAUCE: x86/resctrl: Move over to specifying MBA control formats ANBZ: #36714 commit eedef6c6a71b0cb4b4de26f4c95173531ef1ffcb NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl specifies the schema format for MB and SMBA in rdt_resources_all[]. Intel platforms take a percentage for MB, AMD platforms take an absolute value which isn't MB/s. Currently these are both treated as a 'range'. Adding support for additional types of control shows that user-space needs to be told what the control formats are. Today users of resctrl must already know if their platform is Intel or AMD to know how the MB resource will behave. The MPAM support exposes new control types that take a 'percentage'. The Intel MB resource is also configured by a percentage, so should be able to expose this to user-space. 
Remove the static configuration for schema_fmt in rdt_resources_all[] and specify it with the other control properties in __get_mem_config_intel() or __get_mem_config_amd(). Signed-off-by: James Morse (cherry picked from commit 3323499e5df777ad2eb10be5c7dc29ae5358c93d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `arch/x86/kernel/cpu/resctrl/core.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 649b806531cd..02cb44561ab6 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -108,7 +108,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = @@ -117,7 +116,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_PERF_PKG] = @@ -392,6 +390,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->membw.max_bw = MAX_MBA_BW; r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { @@ -432,6 +431,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; + r->schema_fmt = RESCTRL_SCHEMA__AMD_MBA; r->membw.max_bw = 1 << eax; /* AMD does not use delay */ -- Gitee From 
41fa0685220217e5391876f63840b2ae7bd70b50 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:32:43 +0000 Subject: [PATCH 46/66] NVIDIA: VR: SAUCE: fs/resctrl: Remove 'range' schema format ANBZ: #36714 commit 8927d5abf17212ccc0539b47480abf594759dcbd NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Resctrl previously had a 'range' schema format that took some kind of number. This has since been split into percentage, MB/s and an AMD platform specific scheme. As range is no longer used, remove it. The last user is mba_sc which should be described as taking MB/s. Signed-off-by: James Morse (cherry picked from commit 6c8f021bc7f7070728763a9a5ddaee7d5f291099 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/ctrlmondata.c | 1 - fs/resctrl/rdtgroup.c | 3 +-- include/linux/resctrl.h | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ec925ce6c877..8d00aeacc337 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -219,7 +219,6 @@ static int parse_line(char *line, struct resctrl_schema *s, case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e6877dbd28d7..3fd143a44327 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2901,7 +2901,7 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type * update_mba_bw(). 
*/ if (is_mba_sc(r)) { - s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->schema_fmt = RESCTRL_SCHEMA_MBPS; s->membw.min_bw = 0; s->membw.max_bw = MBA_MAX_MBPS; s->membw.bw_gran = 1; @@ -2911,7 +2911,6 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 416bc1e0c2f3..5cce4feabab3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -286,14 +286,12 @@ enum resctrl_scope { /** * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. - * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. 
*/ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, - RESCTRL_SCHEMA_RANGE, RESCTRL_SCHEMA_PERCENT, RESCTRL_SCHEMA_MBPS, RESCTRL_SCHEMA__AMD_MBA, @@ -430,7 +428,6 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: @@ -450,7 +447,6 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: -- Gitee From b352f089133b12539e6dba2907be701634602a9f Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:49:06 +0000 Subject: [PATCH 47/66] NVIDIA: SAUCE: fs/resctrl: Add additional files for percentage and bitmap controls ANBZ: #36714 commit 3ea879a9537ee9e45c1a4d5359ba6c7fd134de13 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. To find the minimum granularity of the percentage cache capacity controls, user-space is expected to read the bandwidth_gran file, and know this has nothing to do with bandwidth. The only problem here is the name of the file. Add duplicates of these properties with percentage and bitmap in the name. These will be exposed based on the schema format. The existing files must remain tied to the specific resources so that they remain visible to user-space. Using the same helpers ensures the values will always be the same regardless of the file used. These files are not exposed until the new RFTYPE schema flags are set on a resource 'fflags'. 
Signed-off-by: James Morse (cherry picked from commit a38c11612e84a927e5b6e2dccf765291a4d498fd https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/internal.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/internal.h | 6 ++++++ fs/resctrl/rdtgroup.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 7e1067b9e326..0460bbbce7a1 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -253,6 +253,7 @@ struct rdtgroup { #define RFTYPE_TOP BIT(6) +/* files that are specific to a type of resource, e.g. throttle_mode */ #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) @@ -263,6 +264,11 @@ struct rdtgroup { #define RFTYPE_RES_PERF_PKG BIT(12) +/* files that are specific to a type of control, e.g. percent_min */ +#define RFTYPE_SCHEMA_BITMAP BIT(13) +#define RFTYPE_SCHEMA_PERCENT BIT(14) +#define RFTYPE_SCHEMA_MBPS BIT(15) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 3fd143a44327..b45d999ef6b8 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2033,6 +2033,13 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .seq_show = resctrl_num_mbm_cntrs_show, }, + { + .name = "bitmap_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "min_cbm_bits", .mode = 0444, @@ -2040,6 +2047,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_cbm_bits_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "bitmaps_min_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = 
RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "shareable_bits", .mode = 0444, @@ -2061,6 +2075,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_bw_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_min", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "bandwidth_gran", .mode = 0444, @@ -2068,6 +2089,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_bw_gran_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "delay_linear", .mode = 0444, -- Gitee From e318ab04cae00027950aff1c0dc953dbe5f18445 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 16:55:39 +0000 Subject: [PATCH 48/66] NVIDIA: SAUCE: fs/resctrl: Add fflags_from_schema() for files based on schema format ANBZ: #36714 commit cbc23872375abb55a9081a73fa64dfc918da1283 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. New files have been added that are selected based on the schema format. Apply the flags to enable these files based on the schema format. Add a new fflags_from_schema() that is used for controls. 
Signed-off-by: James Morse (cherry picked from commit db005687c69b453ea63389314ba791dc9df18e1a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index b45d999ef6b8..3cd9b87561de 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2532,7 +2532,35 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_PERF_PKG; } - return WARN_ON_ONCE(1); + return 0; +} + +static u32 fflags_from_schema(struct resctrl_schema *s) +{ + struct rdt_resource *r = s->res; + u32 fflags = 0; + + /* Some resources are configured purely from their rid */ + fflags |= fflags_from_resource(r); + if (fflags) + return fflags; + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + fflags |= RFTYPE_SCHEMA_BITMAP; + break; + case RESCTRL_SCHEMA_PERCENT: + fflags |= RFTYPE_SCHEMA_PERCENT; + break; + case RESCTRL_SCHEMA_MBPS: + fflags |= RFTYPE_SCHEMA_MBPS; + break; + case RESCTRL_SCHEMA__AMD_MBA: + /* No standard files are exposed */ + break; + } + + return fflags; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) @@ -2555,7 +2583,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + fflags = fflags_from_schema(s) | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; -- Gitee From 3e85ece36aa776e8c205586ee489d337b392bd16 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 18:13:37 +0100 Subject: [PATCH 
49/66] NVIDIA: SAUCE: fs/resctrl: Expose the schema format to user-space ANBZ: #36714 commit 6abd323412e2607722bceb3282c9bae3993ebad7 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 If more schemas are added to resctrl, user-space needs to know how to configure them. To allow user-space to configure schema it doesn't know about, it would be helpful to tell user-space the format, e.g. percentage. Add a file under info that describes the schema format. Percentages and 'mbps' are implicitly decimal, bitmaps are expected to be in hex. Signed-off-by: James Morse (cherry picked from commit f0ae6915fc22fa0a7affd46f61e0fe4a7673df06 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 3cd9b87561de..829598af0549 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1792,6 +1792,30 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, return 0; } +static int resctrl_schema_format_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + seq_puts(seq, "bitmap\n"); + break; + case RESCTRL_SCHEMA_PERCENT: + seq_puts(seq, "percentage\n"); + break; + case RESCTRL_SCHEMA_MBPS: + seq_puts(seq, "mbps\n"); + break; + /* The way these schema behave isn't discoverable from resctrl */ + case RESCTRL_SCHEMA__AMD_MBA: + seq_puts(seq, "platform\n"); + break; + } + + return 0; +} + static void mbm_config_write_domain(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { @@ -2248,6 +2272,14 @@ static struct rftype res_common_files[] = { .seq_show 
= rdtgroup_closid_show, .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, + { + .name = "schema_format", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_schema_format_show, + .fflags = RFTYPE_CTRL_INFO, + }, + }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -- Gitee From 204a2ac609ef66253d687b6c2f9386e6ae97f41f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 12:35:13 +0000 Subject: [PATCH 50/66] NVIDIA: SAUCE: fs/resctrl: Add L2 and L3 'MAX' resource schema ANBZ: #36714 commit 94d9955a2d3c9bc194ef02abb9ef0368e636d79f NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAM can have both cache portion and cache capacity controls on any cache that supports MPAM. Cache portion bitmaps can be exposed via resctrl if they are implemented on L2 or L3. The cache capacity controls can not be used to isolate portions, which is implicit in the L2 or L3 bitmap provided by user-space. These controls need to be configured with something more like a percentage. Add the resource enum entries for these two resources. No additional resctrl code is needed because the architecture code will specify this resource takes a 'percentage', re-using the support previously used only for the MB resource. 
Signed-off-by: James Morse (cherry picked from commit 2e9f961c2cad4bdcc49f1a598ee131725129337f https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/resctrl.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5cce4feabab3..212c0deafbe0 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -55,6 +55,8 @@ enum resctrl_res_level { RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, RDT_RESOURCE_PERF_PKG, + RDT_RESOURCE_L3_MAX, + RDT_RESOURCE_L2_MAX, /* Must be the last */ RDT_NUM_RESOURCES, -- Gitee From b7b9d7ddc212de0fe0b81189c08c6acd15e3e139 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:51:03 +0000 Subject: [PATCH 51/66] NVIDIA: SAUCE: arm_mpam: resctrl: Add the glue code to convert to/from cmax ANBZ: #36714 commit 44cd5e901a74c40a3f5878e4fa1355cb87f6b1cb NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 MPAM's maximum cache-capacity controls take a fixed point fraction format. Instead of dumping this on user-space, convert it to a percentage. User-space using resctrl already knows how to handle percentages. 
Signed-off-by: James Morse (cherry picked from commit 10caa1269560b1006811725d9564f0e859a53e2e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 67 ++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index d498f7aa8ce7..afc3a688c53d 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -764,6 +764,13 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +static bool cache_has_usable_cmax(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + return mpam_has_feature(mpam_feat_cmax_cmax, cprops); +} + static bool mba_class_use_mbw_max(struct mpam_props *cprops) { return (mpam_has_feature(mpam_feat_mbw_max, cprops) && @@ -898,6 +905,11 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return percent_to_fract16(pc, cprops->bwa_wd); } +static u16 percent_to_cmax(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->cmax_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { if (!mba_class_use_mbw_max(cprops)) { @@ -1055,6 +1067,7 @@ static bool traffic_matches_l3(struct mpam_class *class) /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { + bool has_cpor, has_cmax; struct mpam_class *class; struct mpam_resctrl_res *res; @@ -1073,7 +1086,9 @@ static void mpam_resctrl_pick_caches(void) continue; } - if (!cache_has_usable_cpor(class)) { + has_cpor = cache_has_usable_cpor(class); + has_cmax = cache_has_usable_cmax(class); + if (!has_cpor && !has_cmax) { pr_debug("class %u cache misses CPOR\n", class->level); continue; } @@ -1084,12 +1099,22 @@ static void mpam_resctrl_pick_caches(void) cpumask_pr_args(cpu_possible_mask)); continue; } - - if (class->level == 2) - res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; - else - res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - res->class = class; + if (has_cpor) { + pr_debug("pick_caches: Class has CPOR\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } + if (has_cmax) { + pr_debug("pick_caches: Class has CMAX\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2_MAX]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3_MAX]; + res->class = class; + } } } @@ -1413,6 +1438,23 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) */ r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); r->alloc_capable = true; + break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; + r->membw.min_bw = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.bw_gran = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.max_bw = 100; + + if (r->rid == RDT_RESOURCE_L2_MAX) { + r->name = "L2_MAX"; + r->ctrl_scope = RESCTRL_L2_CACHE; + } else { + r->name = "L3_MAX"; + r->ctrl_scope = RESCTRL_L3_CACHE; + } + break; case RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; @@ -1584,6 +1626,10 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = 
mpam_feat_cpor_part; break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + configured_by = mpam_feat_cmax_cmax; + break; case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { configured_by = mpam_feat_mbw_max; @@ -1601,6 +1647,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, switch (configured_by) { case mpam_feat_cpor_part: return cfg->cpbm; + case mpam_feat_cmax_cmax: + return fract16_to_percent(cfg->cmax, cprops->cmax_wd); case mpam_feat_mbw_max: return mbw_max_to_percent(cfg->mbw_max, cprops); default: @@ -1653,6 +1701,11 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + cfg.cmax = percent_to_cmax(cfg_val, cprops); + mpam_set_feature(mpam_feat_cmax_cmax, &cfg); + break; case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); -- Gitee From b25e4e8ebadf827fc85c115f8e5635fc7e78467b Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 17:14:47 -0600 Subject: [PATCH 52/66] NVIDIA: VR: SAUCE: arm_mpam: Avoid MSC teardown for the SW programming errors ANBZ: #36714 commit 1e4229deb565137e7bb29112ad90901fb375f408 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 There is no need to destroy the MSC instance for user/admin programming errors, since they do not cause any functional issues. 
Signed-off-by: Shanker Donthineni (cherry picked from commit 7d348a2cf872998f094587434d1e6e61f2017445 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 4ef5414ce0ae..b22da65e8959 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2535,6 +2535,12 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) msc->id, mpam_errcode_names[errcode], partid, pmg, ris); + /* No action is required for the MPAM programming errors */ + if ((errcode != MPAM_ERRCODE_REQ_PARTID_RANGE) && + (errcode != MPAM_ERRCODE_REQ_PMG_RANGE)) { + return IRQ_HANDLED; + } + /* Disable this interrupt. */ mpam_disable_msc_ecr(msc); -- Gitee From b4a7a8bf08889acb7e396d278d4c47d1e2dc1c50 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 13 May 2025 11:44:23 -0500 Subject: [PATCH 53/66] NVIDIA: VR: SAUCE: arm_mpam: Handle CPU-less numa nodes ANBZ: #36714 commit e598d1bcfe490bedcfd6b669668d635e99aacb88 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 In a NUMA system, each node may include CPUs, memory, MPAM MSC instances, or any combination thereof. Some high-end servers may have NUMA nodes that include MPAM MSC but no CPUs. In such cases, associate all possible CPUs for those MSCs. 
Signed-off-by: Shanker Donthineni (cherry picked from commit 95f0fd86a3d4ff75ecda369136e905e329547dc1 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index b22da65e8959..c1c9f865238c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -527,6 +527,10 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, case MPAM_CLASS_MEMORY: get_cpumask_from_node_id(comp->comp_id, affinity); /* affinity may be empty for CPU-less memory nodes */ + if (cpumask_empty(affinity)) { + dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); + cpumask_copy(affinity, cpu_possible_mask); + } break; case MPAM_CLASS_UNKNOWN: return 0; -- Gitee From fb530467e542d1981ac419059dc0caa67dedeb12 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 15:04:47 -0600 Subject: [PATCH 54/66] NVIDIA: SAUCE: arm_mpam: Include all associated MSC components during domain setup ANBZ: #36714 commit 1f642b98973c4cac37455995055b2a3676472208 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 The current MPAM driver only considers the first component associated with an online/offline CPU during domain creation and teardown. This is insufficient, as CPU-initiated traffic may traverse multiple MSCs before reaching the target, and each MSC must be programmed consistently for proper resource partitioning. Update the MPAM driver to include all components associated with a given CPU during domain setup/teardown to expose expected schemata to userspace for effective resource control. 
Signed-off-by: Shanker Donthineni (forward ported from commit ac1e5be5e8fddc807e9c5bbc10da3797a601bc95 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Leaves drivers/resctrl/mpam_internal.h untouched; mpam_resctrl_offline_cpu() is already void in the baseline used here. - Tightens callers (mpam_resctrl_pick_mba, mpam_resctrl_pick_counters) around traffic_matches_l3() together with topology_matches_l3() and cpumask_equal(&class->affinity, cpu_possible_mask) and does not add a traffic_matches_l3() function body here, which is already defined in upstream. - Omits any edit to exposed_alloc_capable or exposed_mon_capable; those symbols are already absent from the baseline in favor of resctrl_arch_alloc_capable() / resctrl_arch_mon_capable(). - Does not add MPAM_MAX_EVENT or a new for_each_mpam_resctrl_mon() / mpam_resctrl_counters[] sizing hunk because that monitor macro and array shape are already in the baseline. - Omits INIT_LIST_HEAD_RCU() on res->resctrl_res.ctrl_domains and mon_domains, omits moving mpam_resctrl_domain_insert() after resctrl_online_*(), and omits adding static void mpam_resctrl_online_domain_hdr(); that list setup and insert ordering are already in the baseline. - Does not replay a void->int conversion for mpam_resctrl_monitor_init() or a mpam_pmg_max + 1 num_rmid path; the baseline already has int-returning mpam_resctrl_monitor_init() and resctrl_arch_system_num_rmid_idx() for num_rmid, so only surrounding line context shifts in this file. 
- Adds mpam_resctrl_mon_from_res() / mpam_resctrl_res_from_mon(), mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r), extends mpam_resctrl_alloc_domain() / mpam_resctrl_get_domain_from_cpu() / mpam_resctrl_get_mon_domain_from_cpu() with struct mpam_component *comp, hardens topology_matches_l3() with matched_once, switches resctrl_arch_mbm_cntr_assign_enabled() to use mon->assigned_counters, and extends mpam_resctrl_pick_domain_id() so memory level > 3 uses component IDs like cache-backed classes] - Modify matched_once and cpu checking in topology_matches_l3() to have a better error handling; Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 3 +- drivers/resctrl/mpam_resctrl.c | 236 +++++++++++++++++++++------------ fs/resctrl/internal.h | 9 +- fs/resctrl/monitor.c | 95 +++++++++---- fs/resctrl/rdtgroup.c | 26 ++-- 5 files changed, 240 insertions(+), 129 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index c1c9f865238c..f5fc450858de 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -530,7 +530,8 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, if (cpumask_empty(affinity)) { dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); cpumask_copy(affinity, cpu_possible_mask); - } + } else if (class->level > 3) + cpumask_copy(affinity, cpu_possible_mask); break; case MPAM_CLASS_UNKNOWN: return 0; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index afc3a688c53d..df53406cbe3c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -175,18 +175,48 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) { - 
l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + if (!res->class) + return NULL; + + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == res->class) + return mon; + } + return NULL; +} + +static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + if (!mon->class) + return NULL; + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == mon->class) + return res; + } + return NULL; +} + +static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r) +{ + r->mon.num_mbm_cntrs = l3_num_allocated_mbwu; if (cdp_enabled) - l3->mon.num_mbm_cntrs /= 2; + r->mon.num_mbm_cntrs /= 2; - if (l3->mon.num_mbm_cntrs) { - l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); - l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + if (r->mon.num_mbm_cntrs) { + r->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + r->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); } else { - l3->mon.mbm_cntr_assignable = false; - l3->mon.mbm_assign_on_mkdir = false; + r->mon.mbm_cntr_assignable = false; + r->mon.mbm_assign_on_mkdir = false; } } @@ -957,10 +987,11 @@ static bool topology_matches_l3(struct mpam_class *victim) { int cpu, err; struct mpam_component *victim_iter; + bool matched_once = false; + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; lockdep_assert_cpus_held(); - cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) return false; @@ -974,8 +1005,11 @@ static bool topology_matches_l3(struct mpam_class *victim) } cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + if (cpu >= nr_cpu_ids) { + if (matched_once) + continue; return false; + } cpumask_clear(tmp_cpumask); err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); @@ -995,6 
+1029,7 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } + matched_once = true; } return true; @@ -1146,13 +1181,15 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (!topology_matches_l3(class)) { + if ((class->level == 3) && !topology_matches_l3(class)) { pr_debug("class %u topology doesn't match L3\n", class->level); continue; } - if (!traffic_matches_l3(class)) { + /* Check memory at egress from L3 for MSC with L3 */ + if (!cpumask_equal(&class->affinity, cpu_possible_mask) && + !traffic_matches_l3(class)) { pr_debug("class %u traffic doesn't match L3 egress\n", class->level); continue; @@ -1319,7 +1356,10 @@ static void mpam_resctrl_pick_counters(void) } has_mbwu = class_has_usable_mbwu(class); - if (has_mbwu && topology_matches_l3(class)) { + if (has_mbwu && + ((class->type == MPAM_CLASS_MEMORY) || + (topology_matches_l3(class) && + traffic_matches_l3(class)))) { pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); /* @@ -1389,10 +1429,16 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain * bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { - if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + + mon = mpam_resctrl_mon_from_res(res); + if (!mon) return false; - return mpam_resctrl_abmc_enabled(); + return mon->assigned_counters ? 
true : false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) @@ -1459,7 +1505,6 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) case RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; - r->mba.delay_linear = true; r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); @@ -1483,6 +1528,9 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; + if ((class->type == MPAM_CLASS_MEMORY) && (class->level > 3)) + return comp->comp_id; + if (topology_matches_l3(class)) { /* Use the corresponding L3 component ID as the domain ID */ int id = get_cpu_cacheinfo_id(cpu, 3); @@ -1504,10 +1552,10 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) */ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct mpam_resctrl_res *res = mpam_resctrl_res_from_mon(mon); size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); - struct rdt_resource *l3 = &res->resctrl_res; + struct rdt_resource *r = &res->resctrl_res; struct mpam_class *class = mon->class; u16 num_mbwu_mon; @@ -1528,7 +1576,7 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return PTR_ERR(mon->assigned_counters); mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); - mpam_resctrl_monitor_sync_abmc_vals(l3); + mpam_resctrl_monitor_sync_abmc_vals(r); return 0; } @@ -1536,8 +1584,15 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - struct rdt_resource *l3 = &res->resctrl_res; + struct mpam_resctrl_res 
*res; + struct rdt_resource *r; + + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + r = &res->resctrl_res; lockdep_assert_cpus_held(); @@ -1564,8 +1619,12 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, * monitoring class. * Setting name is necessary on monitor only platforms. */ - l3->name = "L3"; - l3->mon_scope = RESCTRL_L3_CACHE; + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) { + r->name = "MB"; + } else { + r->name = "L3"; + } + r->mon_scope = RESCTRL_L3_CACHE; /* * num-rmid is the upper bound for the number of monitoring groups that @@ -1575,10 +1634,10 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, * this does mean userspace needs to know the architecture to correctly * interpret this value. */ - l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + r->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); if (resctrl_enable_mon_event(type, false, 0, NULL)) - l3->mon_capable = true; + r->mon_capable = true; switch (type) { case QOS_L3_MBM_LOCAL_EVENT_ID: @@ -1850,41 +1909,26 @@ static struct mpam_component *find_component(struct mpam_class *class, int cpu) } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { int err; struct mpam_resctrl_dom *dom; struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - struct mpam_class *class = res->class; - struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - ctrl_comp = NULL; - guard(srcu)(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp 
= comp_iter; - break; - } - } - - /* class has no component for this CPU */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); - if (r->alloc_capable) { - dom->ctrl_comp = ctrl_comp; + if (resctrl_arch_alloc_capable()) { + dom->ctrl_comp = comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, comp, r->rid, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; err = resctrl_online_ctrl_domain(r, ctrl_d); if (err) @@ -1895,7 +1939,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) pr_debug("Skipped control domain online - no controls\n"); } - if (r->mon_capable) { + if (resctrl_arch_mon_capable()) { struct mpam_component *any_mon_comp = NULL; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; @@ -1914,7 +1958,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, cpu); + mon_comp = comp ? comp: find_component(mon->class, cpu); dom->mon_comp[eventid] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; @@ -1940,7 +1984,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) return dom; offline_ctrl_domain: - if (r->alloc_capable) { + if (resctrl_arch_alloc_capable()) { mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); resctrl_offline_ctrl_domain(r, ctrl_d); } @@ -1958,7 +2002,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id * for anything that is not a cache. 
*/ -static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +static struct mpam_resctrl_dom * +mpam_resctrl_get_mon_domain_from_cpu(int cpu, struct mpam_component *comp) { int cache_id; struct mpam_resctrl_dom *dom; @@ -1972,7 +2017,9 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) if (cache_id < 0) return NULL; - list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + list_for_each_entry(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (dom->resctrl_mon_dom.hdr.id == cache_id) return dom; } @@ -1981,7 +2028,8 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) } static struct mpam_resctrl_dom * -mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { struct mpam_resctrl_dom *dom; struct rdt_resource *r = &res->resctrl_res; @@ -1989,6 +2037,8 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) lockdep_assert_cpus_held(); list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } @@ -1997,38 +2047,44 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return NULL; /* Search the mon domain list too - needed on monitor only platforms. 
*/ - return mpam_resctrl_get_mon_domain_from_cpu(cpu); + return mpam_resctrl_get_mon_domain_from_cpu(cpu, comp); } int mpam_resctrl_online_cpu(unsigned int cpu) { + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; struct mpam_resctrl_res *res; enum resctrl_res_level rid; + struct mpam_component *comp; guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; - struct rdt_resource *r = &res->resctrl_res; if (!res->class) continue; // dummy_resource; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (!dom) { - dom = mpam_resctrl_alloc_domain(cpu, res); - if (IS_ERR(dom)) - return PTR_ERR(dom); - } else { - if (r->alloc_capable) { - struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; - - mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); - } - if (r->mon_capable) { - struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; - - mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res, comp); + } else { + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } } + if (IS_ERR(dom)) + return PTR_ERR(dom); } } @@ -2039,6 +2095,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) void mpam_resctrl_offline_cpu(unsigned int cpu) { + struct mpam_component *comp; struct mpam_resctrl_res *res; enum resctrl_res_level rid; @@ -2050,35 +2107,38 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; bool ctrl_dom_empty, mon_dom_empty; - struct rdt_resource *r = 
&res->resctrl_res; if (!res->class) continue; // dummy resource - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (WARN_ON_ONCE(!dom)) - continue; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (WARN_ON_ONCE(!dom)) + continue; - if (r->alloc_capable) { - ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); - if (ctrl_dom_empty) - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - } else { ctrl_dom_empty = true; - } + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } - if (r->mon_capable) { - mon_d = &dom->resctrl_mon_dom; - mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); - if (mon_dom_empty) - resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); - } else { mon_dom_empty = true; - } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } - if (ctrl_dom_empty && mon_dom_empty) - kfree(dom); + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0460bbbce7a1..75dc5c7cdae7 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -387,9 +387,9 @@ int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -int resctrl_l3_mon_resource_init(void); +int resctrl_mon_init(void); -void resctrl_l3_mon_resource_exit(void); +void resctrl_mon_exit(void); void mon_event_count(void *info); @@ -476,6 +476,11 @@ ssize_t resctrl_io_alloc_cbm_write(struct 
kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); u32 resctrl_io_alloc_closid(struct rdt_resource *r); +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 94da0360952d..b664fb16b4c6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -450,7 +450,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct mbm_state *m; u64 tval = 0; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) { rr->err = -EIO; return -EINVAL; } @@ -547,6 +547,7 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: + case RDT_RESOURCE_MBA: WARN_ON_ONCE(rr->evt->any_cpu); if (rr->hdr) return __l3_mon_event_count(rdtgrp, rr); @@ -592,7 +593,7 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct rdt_l3_mon_domain *d; struct mbm_state *m; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) return; d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); m = get_mbm_state(d, closid, rmid, rr->evt->evtid); @@ -1002,7 +1003,7 @@ void free_rmid_lru_list(void) */ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), - MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_MBA, false), MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), MON_EVENT(PMT_EVENT_ENERGY, 
"core_energy", RDT_RESOURCE_PERF_PKG, true), MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), @@ -1631,9 +1632,9 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, return ret; } -int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +static int mbm_assignments_show(struct kernfs_open_file *of, struct seq_file *s, + void *v, struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; @@ -1679,6 +1680,18 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi return ret; } +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + /* * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching * event name. 
@@ -1773,10 +1786,10 @@ static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup return -EINVAL; } -ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +static ssize_t mbm_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdtgroup *rdtgrp; char *token, *event; int ret = 0; @@ -1818,6 +1831,20 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) { if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { @@ -1858,7 +1885,7 @@ static void closid_num_dirty_rmid_free(void) } /** - * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. + * resctrl_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID @@ -1870,27 +1897,21 @@ static void closid_num_dirty_rmid_free(void) * * Return: 0 for success, or -ENOMEM. */ -int resctrl_l3_mon_resource_init(void) +static void resctrl_mon_resource_init(struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; + unsigned long fflags; - if (!r->mon_capable) - return 0; - - ret = closid_num_dirty_rmid_alloc(r); - if (ret) - return ret; + fflags = (r->rid == RDT_RESOURCE_MBA) ? 
RFTYPE_RES_MB :RFTYPE_RES_CACHE; if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) @@ -1908,19 +1929,43 @@ int resctrl_l3_mon_resource_init(void) NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("available_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | - RFTYPE_RES_CACHE); - resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + fflags); + if (r->rid == RDT_RESOURCE_MBA) + resctrl_file_fflags_init("mbm_MB_assignments", RFTYPE_MON_BASE); + else + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + resctrl_file_fflags_init("mbm_assign_mode", RFTYPE_MON_INFO | + fflags); } +} + +int resctrl_mon_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = closid_num_dirty_rmid_alloc(r); + if (ret) + return ret; + + resctrl_mon_resource_init(r); + + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r) + resctrl_mon_resource_init(r); return 0; } -void resctrl_l3_mon_resource_exit(void) +void resctrl_mon_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 
829598af0549..062e4965807f 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2188,6 +2188,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_L3_assignments_show, .write = mbm_L3_assignments_write, }, + { + .name = "mbm_MB_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_MB_assignments_show, + .write = mbm_MB_assignments_write, + }, { .name = "mbm_assign_mode", .mode = 0644, @@ -4664,10 +4671,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, hdr); - if (r->rid != RDT_RESOURCE_L3) - goto out_unlock; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); @@ -4773,10 +4777,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr mutex_lock(&rdtgroup_mutex); - if (r->rid != RDT_RESOURCE_L3) - goto mkdir; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); @@ -4793,7 +4794,6 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); -mkdir: err = 0; /* * If the filesystem is not mounted then only the default resource group @@ -4899,13 +4899,13 @@ int resctrl_init(void) io_alloc_init(); - ret = resctrl_l3_mon_resource_init(); + ret = resctrl_mon_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4940,7 +4940,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - 
resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -5003,6 +5003,6 @@ void resctrl_exit(void) * it can be used to umount resctrl. */ - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); free_rmid_lru_list(); } -- Gitee From 9b3c3c6afc950363e4646537f006ea3bd55975b1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 55/66] NVIDIA: VR: SAUCE: resctrl: add MB_HLIM resource ID and schema type ANBZ: #36714 commit c9039b4c3c6d5250a8c33a7fe1bf07e14ef0ed35 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add RDT_RESOURCE_MB_HLIM and RESCTRL_SCHEMA_MB_HLIM for per-domain MBW maximum hard-limit control on ARM MPAM. Document the schema in kernel-doc. Extend resctrl_get_resource_default_ctrl() for RESCTRL_SCHEMA_MB_HLIM (default 0). (cherry picked from commit 9fc8c60cb42b519bb85c37ede8d44b847b18ae40 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- include/linux/resctrl.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 212c0deafbe0..8071ee0bb5a5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -57,6 +57,7 @@ enum resctrl_res_level { RDT_RESOURCE_PERF_PKG, RDT_RESOURCE_L3_MAX, RDT_RESOURCE_L2_MAX, + RDT_RESOURCE_MB_HLIM, /* Must be the last */ RDT_NUM_RESOURCES, @@ -291,12 +292,15 @@ enum resctrl_scope { * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. + * @RESCTRL_SCHEMA_MB_HLIM: Per-domain MBW max hard limit (0/1), ARM MPAM only + * when MPAMF_MBW_IDR.MAX_LIM is 0b00 (HARDLIM RW). 
*/ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, RESCTRL_SCHEMA_PERCENT, RESCTRL_SCHEMA_MBPS, RESCTRL_SCHEMA__AMD_MBA, + RESCTRL_SCHEMA_MB_HLIM, }; /** @@ -434,6 +438,8 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: return r->membw.max_bw; + case RESCTRL_SCHEMA_MB_HLIM: + return 0; } return WARN_ON_ONCE(1); -- Gitee From c7b18da1d8109516f36decead370fd22d38009ff Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 56/66] NVIDIA: VR: SAUCE: resctrl: wire MB_HLIM schemata parsing and group setup ANBZ: #36714 commit 1133f1e51768a06aa252859d4a7d9dc737bb8d95 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add mb_hlim parsing and validation (0/1), wire RESCTRL_SCHEMA_MB_HLIM into schemata and control-group setup, align MB_HLIM with MBA for exclusive mode and pseudo-locking, and default new groups to hardlim off. (cherry picked from commit 6911e81a21d23c2622c9dbf6e7b374d67dd4e672 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/ctrlmondata.c | 33 ++++++++++++++++++++++++++++++++- fs/resctrl/rdtgroup.c | 23 ++++++++++++++++++++++- include/linux/resctrl.h | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 8d00aeacc337..138321730f73 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -90,6 +90,33 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, return 0; } +static bool hlim_validate(char *buf, u32 *data) +{ + int ret = kstrtou32(buf, 10, data); + + if (ret || (*data != 0 && *data != 1)) { + rdt_last_cmd_printf("Invalid MB_HLIM value %s (expect 0 or 1)\n", buf); + return false; + } + return true; +} + +static int parse_mb_hlim(struct rdt_parse_data *data, 
struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 v; + + if (!hlim_validate(data->buf, &v)) + return -EINVAL; + + cfg = &d->staged_config[s->conf_type]; + cfg->new_ctrl = v; + cfg->have_new_ctrl = true; + + return 0; +} + /* * Check whether a cache bit mask is valid. * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: @@ -224,13 +251,17 @@ static int parse_line(char *line, struct resctrl_schema *s, case RESCTRL_SCHEMA__AMD_MBA: parse_ctrlval = &parse_bw; break; + case RESCTRL_SCHEMA_MB_HLIM: + parse_ctrlval = &parse_mb_hlim; + break; } if (WARN_ON_ONCE(!parse_ctrlval)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA || + r->rid == RDT_RESOURCE_MB_HLIM)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); return -EINVAL; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 062e4965807f..53a6d4d33b01 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1514,7 +1514,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA || + r->rid == RDT_RESOURCE_MB_HLIM) continue; has_cache = true; list_for_each_entry(d, &r->ctrl_domains, hdr.list) { @@ -1811,6 +1812,9 @@ static int resctrl_schema_format_show(struct kernfs_open_file *of, case RESCTRL_SCHEMA__AMD_MBA: seq_puts(seq, "platform\n"); break; + case RESCTRL_SCHEMA_MB_HLIM: + seq_puts(seq, "0/1\n"); + break; } return 0; @@ -2595,6 +2599,7 @@ static u32 fflags_from_schema(struct resctrl_schema *s) fflags |= RFTYPE_SCHEMA_MBPS; break; case RESCTRL_SCHEMA__AMD_MBA: + case RESCTRL_SCHEMA_MB_HLIM: /* No standard files are exposed */ break; } @@ -3009,6 +3014,7 @@ static int 
schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: + case RESCTRL_SCHEMA_MB_HLIM: s->fmt_str = "%d=%u"; break; } @@ -3977,6 +3983,19 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } } +/* Initialize MB_HLIM resource with default hardlim off (0). */ +static void rdtgroup_init_mb_hlim(struct resctrl_schema *s) +{ + struct resctrl_staged_config *cfg; + struct rdt_ctrl_domain *d; + + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { + cfg = &d->staged_config[s->conf_type]; + cfg->new_ctrl = 0; + cfg->have_new_ctrl = true; + } +} + /* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { @@ -3993,6 +4012,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) rdtgroup_init_mba(r, rdtgrp->closid); if (is_mba_sc(r)) continue; + } else if (r->rid == RDT_RESOURCE_MB_HLIM) { + rdtgroup_init_mb_hlim(s); } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8071ee0bb5a5..99ecfa8764fc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -459,6 +459,8 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: return s->membw.max_bw; + case RESCTRL_SCHEMA_MB_HLIM: + return 0; } return WARN_ON_ONCE(1); -- Gitee From a91b466cb831f65b02cffcd3659b481fe3257d57 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:48 +0000 Subject: [PATCH 57/66] NVIDIA: VR: SAUCE: resctrl/mpam: probe MPAMF_MBW_IDR MAX_LIM and hardlim_rw ANBZ: #36714 commit ca37d5c8e00a51091a3974d39b5cf9d58db53652 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Read mbw_max_lim from MPAMF_MBW_IDR.MAX_LIM when MBW_MAX is present, derive mpam_feat_mbw_max_hardlim_rw when both soft and hard limiting are supported, and merge mbw_max_lim and the 
feature across MSCs. Add mpam_props_sync_mbw_max_hardlim_rw() and propagate merged state in __props_mismatch(). (cherry picked from commit ced4d460ce56aa8b18d8ab3f19eeb76beeb25322 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 40 ++++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 7 ++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f5fc450858de..029d89cb4da4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -789,6 +789,19 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) #define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) +/* Align mpam_feat_mbw_max_hardlim_rw with MPAMF_MBW_IDR.MAX_LIM and mbw_max. 
*/ +static void mpam_props_sync_mbw_max_hardlim_rw(struct mpam_props *props) +{ + if (!mpam_has_feature(mpam_feat_mbw_max, props)) { + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, props); + return; + } + if (props->mbw_max_lim == 0) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, props); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, props); +} + static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) { int err; @@ -836,6 +849,8 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + props->mbw_max_lim = 0; + /* portion bitmap resolution */ props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); if (props->mbw_pbm_bits && @@ -850,14 +865,18 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) */ props->bwa_wd = min(props->bwa_wd, 16); - if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) { mpam_set_feature(mpam_feat_mbw_max, props); + props->mbw_max_lim = FIELD_GET(MPAMF_MBW_IDR_MAX_LIM, mbw_features); + } if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) mpam_set_feature(mpam_feat_mbw_min, props); if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) mpam_set_feature(mpam_feat_mbw_prop, props); + + mpam_props_sync_mbw_max_hardlim_rw(props); } /* Priority partitioning */ @@ -2267,12 +2286,31 @@ static void __props_mismatch(struct mpam_props *parent, if (alias && !mpam_has_bwa_wd_feature(parent) && mpam_has_bwa_wd_feature(child)) { parent->bwa_wd = child->bwa_wd; + parent->mbw_max_lim = child->mbw_max_lim; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, child)) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, parent); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, parent); } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, bwa_wd, alias)) { pr_debug("took the min 
bwa_wd\n"); parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); } + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_max, alias)) { + parent->mbw_max_lim = child->mbw_max_lim; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, child)) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, parent); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, parent); + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_max, + mbw_max_lim, alias)) { + pr_debug("%s mbw_max_lim mismatch, clearing mbw_max\n", __func__); + mpam_clear_feature(mpam_feat_mbw_max, parent); + parent->mbw_max_lim = 0; + mpam_props_sync_mbw_max_hardlim_rw(parent); + } + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { parent->cmax_wd = child->cmax_wd; } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 407f0b2f5014..402c4f8b7e62 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -206,6 +206,7 @@ enum mpam_device_features { mpam_feat_mbw_part, mpam_feat_mbw_min, mpam_feat_mbw_max, + mpam_feat_mbw_max_hardlim_rw, mpam_feat_mbw_prop, mpam_feat_intpri_part, mpam_feat_intpri_part_0_low, @@ -239,6 +240,11 @@ struct mpam_props { u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; + /* + * MPAMF_MBW_IDR.MAX_LIM [1:0] when mpam_feat_mbw_max; else 0. + * 0 = soft+hard, 1 = soft only, 2 = hard only, 3 = reserved. 
+ */ + u8 mbw_max_lim; /* * Kunit tests use memset() to set up feature combinations that should be @@ -630,6 +636,7 @@ static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } /* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ #define MPAMF_MBW_IDR_BWA_WD GENMASK(5, 0) +#define MPAMF_MBW_IDR_MAX_LIM GENMASK(9, 8) #define MPAMF_MBW_IDR_HAS_MIN BIT(10) #define MPAMF_MBW_IDR_HAS_MAX BIT(11) #define MPAMF_MBW_IDR_HAS_PBM BIT(12) -- Gitee From 0564f9f411222f70ac6e2b63de225436b6f92530 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 58/66] NVIDIA: VR: SAUCE: resctrl/mpam: track MBW max hard-limit in config ANBZ: #36714 commit d2acb9656c2062d8d5efee54bde507bd78886fa6 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add mbw_max_hardlim to mpam_config. When reprogramming MBW_MAX, OR in HARDLIM (MPAMCFG_MBW_MAX bit 31) when either MBW_MAX or HARDLIM_RW features are active. Merge HARDLIM in mpam_update_config(). 
(forward ported from commit 114894b316e4f01d12979c66f58177b5af1b144d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_devices.c | 15 ++++++++++++--- drivers/resctrl/mpam_internal.h | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 029d89cb4da4..d2773ee5d448 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1650,10 +1650,17 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, } if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { - if (mpam_has_feature(mpam_feat_mbw_max, cfg)) - mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); - else + if (mpam_has_feature(mpam_feat_mbw_max, cfg) || + mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg)) { + u32 mbw_val = cfg->mbw_max; + + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg) && + cfg->mbw_max_hardlim) + mbw_val |= MPAMCFG_MBW_MAX_HARDLIM; + mpam_write_partsel_reg(msc, MBW_MAX, mbw_val); + } else { mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + } } if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) @@ -3146,6 +3153,8 @@ static bool mpam_update_config(struct mpam_config *cfg, maybe_update_config(cfg, mpam_feat_cmax_cmax, newcfg, cmax, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_max_hardlim_rw, newcfg, + mbw_max_hardlim, has_changes); return has_changes; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 402c4f8b7e62..b463771bb0d5 100644 --- 
a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -361,6 +361,7 @@ struct mpam_config { u32 cpbm; u32 mbw_pbm; u16 mbw_max; + bool mbw_max_hardlim; u16 cmax; u16 cmin; -- Gitee From 6cbcd2bae692d50a96b53a711f7205bc55057631 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 59/66] NVIDIA: VR: SAUCE: resctrl/mpam: bind MB_HLIM resource to MBA MPAM class ANBZ: #36714 commit 00999657322bbc3560d71c3448221c7dade2a12d NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 When mpam_feat_mbw_max_hardlim_rw is present, share the MBA MPAM class with RDT_RESOURCE_MB_HLIM. Wire resctrl_arch_get_config and resctrl_arch_update_one for MB_HLIM, and tear down MBA and MB_HLIM controls together on class removal. Handle RDT_RESOURCE_MB_HLIM in fflags_from_resource() so creating info/MB_HLIM at resctrl mount does not hit the unhandled-rid WARN. (forward ported from commit 04e2ea34385215db0e00716ad4553c15084382f6 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 53 ++++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 2 ++ 2 files changed, 55 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index df53406cbe3c..7c909dec7cc8 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1210,6 +1210,13 @@ static void mpam_resctrl_pick_mba(void) candidate_class->level); res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; res->class = candidate_class; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, + &candidate_class->props)) { + struct mpam_resctrl_res *mbh = + &mpam_resctrl_controls[RDT_RESOURCE_MB_HLIM]; + + mbh->class = candidate_class; + } } } @@ -1501,6 +1508,14 @@ static int 
mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->ctrl_scope = RESCTRL_L3_CACHE; } + break; + case RDT_RESOURCE_MB_HLIM: + if (!mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cprops)) + break; + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_MB_HLIM; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->name = "MB_HLIM"; break; case RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; @@ -1650,6 +1665,33 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, return 0; } +/* MB_HLIM schemata read: 0/1 per domain for current closid. */ +static u32 mpam_read_mbw_max_hardlim(struct rdt_resource *r, struct rdt_ctrl_domain *dom, + u32 closid, enum resctrl_conf_type type) +{ + struct mpam_resctrl_dom *m_dom; + struct mpam_config *cfg; + u32 partid; + + if (!mpam_is_enabled() || r->rid != RDT_RESOURCE_MB_HLIM) + return 0; + + partid = resctrl_get_config_index(closid, type); + if (partid >= resctrl_arch_get_num_closid(r)) + return 0; + + m_dom = container_of(dom, struct mpam_resctrl_dom, resctrl_ctrl_dom); + if (!m_dom->ctrl_comp || !m_dom->ctrl_comp->cfg) + return 0; + + cfg = &m_dom->ctrl_comp->cfg[partid]; + if (!mpam_has_feature(mpam_feat_mbw_max, cfg) && + !mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg)) + return 0; + + return cfg->mbw_max_hardlim ? 
1 : 0; +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -1689,6 +1731,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3_MAX: configured_by = mpam_feat_cmax_cmax; break; + case RDT_RESOURCE_MB_HLIM: + return mpam_read_mbw_max_hardlim(r, d, closid, type); case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { configured_by = mpam_feat_mbw_max; @@ -1771,6 +1815,15 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, mpam_set_feature(mpam_feat_mbw_max, &cfg); break; } + return -EINVAL; + case RDT_RESOURCE_MB_HLIM: + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cprops) && + mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max_hardlim = cfg_val != 0; + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, &cfg); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } fallthrough; default: return -EINVAL; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 53a6d4d33b01..502ed7b21744 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2573,6 +2573,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_MB; case RDT_RESOURCE_PERF_PKG: return RFTYPE_RES_PERF_PKG; + case RDT_RESOURCE_MB_HLIM: + return 0; } return 0; -- Gitee From cf07e0a3d3709ae2fabf2d2705ddcdf7c31283da Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 60/66] NVIDIA: VR: SAUCE: resctrl: add membw.mb_max_lim and arch_has_mb_max_lim ANBZ: #36714 commit bb32f97fc61e055f72e33ca0f40fa20b3522be7e NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add mb_max_lim and arch_has_mb_max_lim to struct resctrl_membw with kernel-doc: MPAM MAX_LIM encoding (MPAMF_MBW_IDR), invalid elsewhere unless arch sets arch_has_mb_max_lim. 
(cherry picked from commit 7939f1e41019d139cc11f2b93d6e8bd7ef4663d1 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- include/linux/resctrl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 99ecfa8764fc..9ef861e795f8 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -252,12 +252,16 @@ enum membw_throttle_mode { * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated * @hwdrc: True if memory bandwidth HWDRC is enabled + * @mb_max_lim: MPAM MAX_LIM encoding (MPAMF_MBW_IDR); invalid elsewhere + * @arch_has_mb_max_lim:True if mb_max_lim is supported */ struct resctrl_membw { u32 min_bw; u32 max_bw; u32 bw_gran; bool hwdrc; + u8 mb_max_lim; + bool arch_has_mb_max_lim; }; /** -- Gitee From 2b84954eafa13317360c1eb91064dc718cf64a28 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 61/66] NVIDIA: VR: SAUCE: resctrl/mpam: populate MBA mb_max_lim from MPAM probe ANBZ: #36714 commit 01d8a8024e15e8884f36353bad084f6219e940ed NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 When mpam_feat_mbw_max is present, set membw.mb_max_lim from cprops->mbw_max_lim and membw.arch_has_mb_max_lim so generic resctrl can expose max_lim. 
(forward ported from commit 05b9bc18310115182b70ec063229f20d601dc32c https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 7c909dec7cc8..71d3fb80783c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1526,6 +1526,11 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + r->membw.mb_max_lim = cprops->mbw_max_lim; + r->membw.arch_has_mb_max_lim = true; + } + r->name = "MB"; r->alloc_capable = true; break; -- Gitee From e92b98fae9af7fcf9158019dbd841d697bb25770 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 62/66] NVIDIA: VR: SAUCE: resctrl: add MBA max_lim sysfs and visibility from init ANBZ: #36714 commit 50a85e171736f70f48bacf9022b9923bed203090 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add rdt_mb_max_lim_show() and the max_lim entry in res_common_files[]. Add mb_max_lim_init() to call resctrl_file_fflags_init("max_lim", ...) when arch_has_mb_max_lim, and invoke it from resctrl_init() after io_alloc_init(). 
(forward ported from commit be13cad9cd1530470c9c7d96cb3665f7da8e1873 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- fs/resctrl/rdtgroup.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 502ed7b21744..b71aa1e5b655 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1303,6 +1303,17 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int rdt_mb_max_lim_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%d\n", r->membw.mb_max_lim); + + return 0; +} + static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -2131,6 +2142,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "max_lim", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mb_max_lim_show, + }, /* * Platform specific which (if any) capabilities are provided by * thread_throttle_mode. Defer "fflags" initialization to platform @@ -2379,6 +2396,17 @@ static void io_alloc_init(void) } } +/* The resctrl file "max_lim" is added using MB resource if visible. 
*/ +static void mb_max_lim_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + if (!r->membw.arch_has_mb_max_lim) + return; + + resctrl_file_fflags_init("max_lim", RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -4922,6 +4950,8 @@ int resctrl_init(void) io_alloc_init(); + mb_max_lim_init(); + ret = resctrl_mon_init(); if (ret) return ret; -- Gitee From 1151060d99b3e4428d9a6addfdd9aa8b37325dc9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:48 +0000 Subject: [PATCH 63/66] NVIDIA: VR: SAUCE: Documentation: resctrl: document max_lim and MB_HLIM for MPAM MBA ANBZ: #36714 commit 816240a7fa16417c330b7d54f6bfe16b52f7599c NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Document the MBA max_lim sysfs file, MB_HLIM schemata (0/1 per domain), and how they relate to MPAM MBW_MAX, HARDLIM, and MPAMF_MBW_IDR.MAX_LIM. Add schema_format for mb_hlim under the MB allocation info directory. max_lim is exposed as a single decimal integer (MPAMF_MBW_IDR.MAX_LIM [1:0], 0-3), matching rdt_mb_max_lim_show(). MB_HLIM appears when the probe treats HARDLIM as read/write, which this series ties to max_lim reading zero (see mpam_props_sync_mbw_max_hardlim_rw()). (cherry picked from commit 93e1b6a2b3729e1e2db818036829c498378b4f16 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- Documentation/filesystems/resctrl.rst | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index e9ff59c2e57e..26739383c015 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -247,6 +247,19 @@ with respect to allocation: non-linear. This field is purely informational only. 
+"max_lim": + Read-only. On ARM MPAM systems where MBA exposes MBW_MAX, this + file contains a single decimal integer: the + ``MPAMF_MBW_IDR.MAX_LIM`` field [1:0] (values ``0``–``3``) as probed for the MBA resource. + The file appears only when the platform supports MBA MBW_MAX and + the MAX_LIM value is available; otherwise it is not listed. + + The Arm MPAM architecture defines the meaning of each MAX_LIM + encoding. In this kernel, when ``max_lim`` reads ``0``, the + driver treats the ``HARDLIM`` bit of ``MPAMCFG_MBW_MAX`` as + read/write and an optional ``MB_HLIM`` line may appear in + ``schemata``. When ``max_lim`` is nonzero, ``MB_HLIM`` is omitted. + "thread_throttle_mode": Indicator on Intel systems of how tasks running on threads of a physical core are throttled in cases where they @@ -963,6 +976,27 @@ Memory bandwidth domain is L3 cache. MB:<cache_id0>=bw_MiBps0;<cache_id1>=bw_MiBps1;... +MBW maximum hard limit (ARM MPAM) +--------------------------------- +On some ARM systems, resctrl memory bandwidth allocation uses MPAM +maximum bandwidth (MBW_MAX). When ``max_lim`` reads ``0`` (see ``max_lim`` +under the ``MB`` allocation ``info`` directory), an additional schemata +line selects the ``HARDLIM`` bit for ``MPAMCFG_MBW_MAX`` independently of +the numeric limit on the ``MB`` line. + +The line uses the same cache/domain indices as ``MB``. Each value must +be ``0`` or ``1``: ``0`` clears HARDLIM (soft-limit behaviour for the +max), ``1`` sets HARDLIM (hard limit). When ``max_lim`` is nonzero or +``MB_HLIM`` is not supported for the platform, the line is omitted from +``schemata``. + +Format: +:: + + MB_HLIM:<cache_id0>=0|1;<cache_id1>=0|1;... + +The corresponding ``schema_format`` entry under ``info`` is ``mb_hlim``. + Slow Memory Bandwidth Allocation (SMBA) --------------------------------------- AMD hardware supports Slow Memory Bandwidth Allocation (SMBA). 
-- Gitee From 450f02980132a8c6eef5875504d5049c171af985 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 13 May 2026 23:30:03 +0000 Subject: [PATCH 64/66] NVIDIA: SAUCE: Fix mbm_L3_assign and mon_local_bytes ANBZ: #36714 commit b35ada977bef327f8276cf44a4fe993f6754d362 NV-kernel BugLink: https://bugs.launchpad.net/bugs/2154527 Add local bytes counter in mpam_resctrl_counters[] to fix missing mbm_local_bytes monitoring on Grace. Add mon->assigned_counters check to enable mbm_L3_assignments config file on Grace. Signed-off-by: Fenghua Yu Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Brad Figg Signed-off-by: Jay Chen --- drivers/resctrl/mpam_resctrl.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 71d3fb80783c..40e0de181db9 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -46,7 +46,7 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; * to those supported by MPAM. * Class pointer may be NULL. 
*/ -#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +#define MPAM_MAX_EVENT QOS_L3_MBM_LOCAL_EVENT_ID static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; #define for_each_mpam_resctrl_mon(mon, eventid) \ @@ -175,21 +175,6 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) -{ - struct mpam_resctrl_mon *mon; - enum resctrl_event_id eventid; - - if (!res->class) - return NULL; - - for_each_mpam_resctrl_mon(mon, eventid) { - if (mon->class == res->class) - return mon; - } - return NULL; -} - static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) { struct mpam_resctrl_res *res; @@ -1437,15 +1422,22 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain * bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { struct mpam_resctrl_res *res; - struct mpam_resctrl_mon *mon; + enum resctrl_event_id evt; res = container_of(r, struct mpam_resctrl_res, resctrl_res); - mon = mpam_resctrl_mon_from_res(res); - if (!mon) - return false; + /* OCCUP shares the L3 class but has no MBWU assigned_counters. */ + for (evt = QOS_L3_MBM_TOTAL_EVENT_ID; evt <= QOS_L3_MBM_LOCAL_EVENT_ID; + evt++) { + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evt]; - return mon->assigned_counters ? true : false; + if (!mon->assigned_counters) + continue; + if (mpam_resctrl_res_from_mon(mon) == res) + return true; + } + + return false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) -- Gitee From ab82537b19fcf39d637e0d5ed66585fab3d3b795 Mon Sep 17 00:00:00 2001 From: Jiankang Chen Date: Tue, 2 Jun 2026 14:37:12 +0800 Subject: [PATCH 65/66] anolis: configs/arm64: enable CONFIG_ARM64_MPAM ANBZ: #36714 Turn on MPAM (Memory Partitioning and Monitoring) on arm64 so the NVIDIA Grace MPAM resctrl stack ported in this series is actually built. 
Promote CONFIG_ARM64_MPAM to L0-MANDATORY/arm64 and enable the driver-layer chain explicitly: - CONFIG_ARM64_MPAM=y (L0-MANDATORY/arm64) - CONFIG_RESCTRL_FS not set (L0-MANDATORY/arm64, placeholder; promoted to default in the next commit) - CONFIG_ACPI_MPAM=y firmware (ACPI MPAM table) - CONFIG_ARM64_MPAM_DRIVER=y MPAM CPU/MSC driver - CONFIG_ARM64_MPAM_DRIVER_DEBUG not set - CONFIG_RESCTRL_IOMMU=y SMMU MPAM integration CONFIG_ARCH_HAS_CPU_RESCTRL is now selected by both x86 and arm64, so move it from L2-OPTIONAL/x86/ to L2-OPTIONAL/default/ and pin it 'not set' on architectures that do not provide a resctrl backend (loongarch, riscv). Signed-off-by: Jay Chen --- anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM | 1 + anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM | 1 - anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_IOMMU | 1 + .../L2-OPTIONAL/{x86 => default}/CONFIG_ARCH_HAS_CPU_RESCTRL | 0 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + 10 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM create mode 100644 anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM delete mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_IOMMU rename anolis/configs/L2-OPTIONAL/{x86 => default}/CONFIG_ARCH_HAS_CPU_RESCTRL (100%) create mode 100644 
anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM b/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM new file mode 100644 index 000000000000..45957b7b4ea2 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM=y diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM new file mode 100644 index 000000000000..e93cbd36cedc --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM @@ -0,0 +1 @@ +CONFIG_ACPI_MPAM=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM deleted file mode 100644 index c34b76e4f406..000000000000 --- a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM +++ /dev/null @@ -1 +0,0 @@ -# CONFIG_ARM64_MPAM is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER new file mode 100644 index 000000000000..9e4b32224138 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM_DRIVER=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG new file mode 100644 index 000000000000..76eca7c2ff09 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG @@ -0,0 +1 @@ +# CONFIG_ARM64_MPAM_DRIVER_DEBUG is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_IOMMU 
b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_IOMMU new file mode 100644 index 000000000000..425eb50c311e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_IOMMU @@ -0,0 +1 @@ +CONFIG_RESCTRL_IOMMU=y diff --git a/anolis/configs/L2-OPTIONAL/x86/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/default/CONFIG_ARCH_HAS_CPU_RESCTRL similarity index 100% rename from anolis/configs/L2-OPTIONAL/x86/CONFIG_ARCH_HAS_CPU_RESCTRL rename to anolis/configs/L2-OPTIONAL/default/CONFIG_ARCH_HAS_CPU_RESCTRL diff --git a/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set -- Gitee From 3b0fd2bb108b900fe9d707129672be5e7cdd2aef Mon Sep 17 00:00:00 2001 From: Jiankang Chen Date: Tue, 2 Jun 2026 15:48:23 +0800 Subject: [PATCH 66/66] anolis: configs/default: enable CONFIG_RESCTRL_FS ANBZ: #36714 Promote the resctrl filesystem to the default baseline now that arm64 (via MPAM, enabled in the previous commit) joins x86 as a CONFIG_RESCTRL_FS provider. Pin it 'not set' on architectures without a resctrl backend. 
L0-MANDATORY: - L0-MANDATORY/x86/CONFIG_RESCTRL_FS -> L0-MANDATORY/default/ (=y) - L0-MANDATORY/arm64/CONFIG_RESCTRL_FS -> L0-MANDATORY/loongarch/ (not set) - L0-MANDATORY/riscv/CONFIG_RESCTRL_FS added (not set) L2-OPTIONAL/arm64 (resctrl-fs side of MPAM): - CONFIG_ARM64_MPAM_RESCTRL_FS=y resctrl backend - CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID=y MPAM monitor allocation L2-OPTIONAL/default (CONFIG_PROC_CPU_RESCTRL is now also selected by arm64): - L2-OPTIONAL/x86/CONFIG_PROC_CPU_RESCTRL -> L2-OPTIONAL/default/ (=y) - L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL added (not set) - L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL added (not set) Signed-off-by: Jay Chen --- anolis/configs/L0-MANDATORY/{x86 => default}/CONFIG_RESCTRL_FS | 0 .../configs/L0-MANDATORY/{arm64 => loongarch}/CONFIG_RESCTRL_FS | 0 anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS | 1 + .../L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID | 1 + .../configs/L2-OPTIONAL/{x86 => default}/CONFIG_PROC_CPU_RESCTRL | 0 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL | 1 + 8 files changed, 5 insertions(+) rename anolis/configs/L0-MANDATORY/{x86 => default}/CONFIG_RESCTRL_FS (100%) rename anolis/configs/L0-MANDATORY/{arm64 => loongarch}/CONFIG_RESCTRL_FS (100%) create mode 100644 anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID rename anolis/configs/L2-OPTIONAL/{x86 => default}/CONFIG_PROC_CPU_RESCTRL (100%) create mode 100644 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL diff --git a/anolis/configs/L0-MANDATORY/x86/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/default/CONFIG_RESCTRL_FS similarity 
index 100% rename from anolis/configs/L0-MANDATORY/x86/CONFIG_RESCTRL_FS rename to anolis/configs/L0-MANDATORY/default/CONFIG_RESCTRL_FS diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/loongarch/CONFIG_RESCTRL_FS similarity index 100% rename from anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS rename to anolis/configs/L0-MANDATORY/loongarch/CONFIG_RESCTRL_FS diff --git a/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS new file mode 100644 index 000000000000..c91ce4ffbafa --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM_RESCTRL_FS=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID new file mode 100644 index 000000000000..8cddb03cb135 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID @@ -0,0 +1 @@ +CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID=y diff --git a/anolis/configs/L2-OPTIONAL/x86/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/default/CONFIG_PROC_CPU_RESCTRL similarity index 100% rename from anolis/configs/L2-OPTIONAL/x86/CONFIG_PROC_CPU_RESCTRL rename to anolis/configs/L2-OPTIONAL/default/CONFIG_PROC_CPU_RESCTRL diff --git a/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set diff 
--git a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set -- Gitee